Gitly


1 // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module scanner
5 
6 import os
7 import strconv
8 import v.token
9 import v.pref
10 import v.util
11 import v.errors
12 
// NOTE(review): judging by its name, this constant exists only to work around a `markused` bug
// by keeping `map[string]int` marked as used - confirm before removing it.
@[markused]
const workaround_markused_bug = map[string]int{}

// the two quote characters, that can delimit a string literal
const single_quote = `'`
const double_quote = `"`
// char used as number separator
const num_sep = `_`
// byte values of the line feed and carriage return characters
const b_lf = 10
const b_cr = 13
const backslash = `\\`
// lookup tables with one entry per possible byte value
const digit_table = get_digit_table()
const letter_table = get_letter_table()
25 
// get_digit_table builds a lookup table with one entry per possible byte value.
// An entry is true, when `u8.is_digit()` holds for the byte used as its index.
@[direct_array_access]
fn get_digit_table() [256]bool {
    mut table := [256]bool{}
    for code := 0; code < 256; code++ {
        table[code] = u8(code).is_digit()
    }
    return table
}
34 
// get_letter_table builds a lookup table with one entry per possible byte value.
// An entry is true, when `u8.is_letter()` holds for the byte used as its index.
@[direct_array_access]
fn get_letter_table() [256]bool {
    mut table := [256]bool{}
    for code := 0; code < 256; code++ {
        table[code] = u8(code).is_letter()
    }
    return table
}
43 
// Scanner holds the text of a single source, the current scanning position in it,
// the buffer of the tokens that were already scanned (`all_tokens`), and the
// errors/warnings/notices, that were collected while scanning.
@[minify]
pub struct Scanner {
pub mut:
    file_path                   string // '/path/to/file.v'
    file_base                   string // 'file.v'
    file_idx                    i16 = -1 // file idx in the global table `filelist`
    text                        string // the whole text of the file
    pos                         int = -1 // current position in the file, first character is s.text[0]
    line_nr                     int // current line number
    last_nl_pos                 int = -1 // for calculating column
    is_inside_string            bool // set to true in a string, *at the start* of a ${expr}
    is_nested_string            bool // '${'abc':-12s}'
    str_helper_tokens           []u8 = []u8{cap: 16} // ', ", 0 (string interpolation with lcbr), { (block)
    line_comment                string
    last_lt                     int = -1 // position of latest <
    is_print_line_on_error      bool
    is_print_colored_error      bool
    is_print_rel_paths_on_error bool
    quote                       u8   // which quote is used to denote current string: ' or "
    nr_lines                    int  // total number of lines in the source file that were scanned
    is_fmt                      bool // Used for v fmt.
    comments_mode               CommentsMode
    is_inside_toplvl_statement  bool          // *only* used in comments_mode: .toplevel_comments, toggled by parser
    all_tokens                  []token.Token // *only* used in comments_mode: .toplevel_comments, contains all tokens
    tidx                        int // index in `all_tokens` of the next token, that .scan() will return
    eofs                        int // how many times the end of file was already reached
    max_eofs                    int = 50 // .end_of_file() panics, when `eofs` becomes bigger than this
    pref                        &pref.Preferences
    error_details               []string
    errors                      []errors.Error
    warnings                    []errors.Warning
    notices                     []errors.Notice
    should_abort                bool // when too many errors/warnings/notices are accumulated, should_abort becomes true, and the scanner should stop

    // the following are used only inside ident_string, but are here to avoid allocating new arrays for the most common case of strings without escapes
    all_pos         []int    = []int{cap: 30}
    u16_escapes_pos []int    = []int{cap: 10} // pos list of \uXXXX
    u32_escapes_pos []int    = []int{cap: 10} // pos list of \UXXXXXXXX
    h_escapes_pos   []int    = []int{cap: 10} // pos list of \xXX
    str_segments    []string = []string{cap: 10}
}
85 
86 /*
87 How the .toplevel_comments mode works:
88 
In this mode, the scanner scans *everything* at once, before parsing starts,
including all the comments, and stores the results in a buffer s.all_tokens.
91 
92 Then .scan() just returns s.all_tokens[ s.tidx++ ] *ignoring* the
93 comment tokens. In other words, by default in this mode, the parser
94 *will not see any comments* inside top level statements, so it has
95 no reason to complain about them.
96 
When the parser determines that it is outside of a top level statement,
it tells the scanner to backtrack s.tidx to the current p.tok index,
then it changes .is_inside_toplvl_statement to false, and refills its
lookahead buffer (i.e. p.peek_tok) from the scanner.
101 
102 In effect, from the parser's point of view, the next tokens, that it will
103 receive with p.next(), will be the same, as if comments are not ignored
104 anymore, *between* top level statements.
105 
106 When the parser determines, that it is going again inside a top level
107 statement, it does the same, this time setting .is_inside_toplvl_statement
108 to true, again refilling the lookahead buffer => calling .next() in this
109 mode, will again ignore all the comment tokens, till the top level statement
110 is finished.
111 */
// The different kinds of scanner modes:
//
// .skip_comments - simplest/fastest, just ignores all comments early.
// This mode is used by the compiler itself.
//
// .parse_comments is used by vfmt. Ideally it should handle inline /* */
// comments too, i.e. it returns every kind of comment as a new token.
//
// .toplevel_comments is used by vdoc, parses *only* top level ones
// that are *outside* structs/enums/fns.
pub enum CommentsMode {
    skip_comments // comment tokens are not even stored in the token buffer
    parse_comments // .scan() returns every comment token
    toplevel_comments // .scan() returns comment tokens only while the scanner is not inside a top level statement
}
127 
// new_scanner_file reads the file `file_path`, and returns a new scanner for it,
// that has already scanned all the tokens of the file in its token buffer.
// It returns an error, when `file_path` is not a file, or when reading it fails.
pub fn new_scanner_file(file_path string, file_idx i16, comments_mode CommentsMode, pref_ &pref.Preferences) !&Scanner {
    if !os.is_file(file_path) {
        return error('${file_path} is not a .v file')
    }
    // a read failure is passed on to the caller unchanged
    source := util.read_file(file_path)!
    mut scanner := &Scanner{
        file_path:                   file_path
        file_base:                   os.base(file_path)
        file_idx:                    file_idx
        text:                        source
        all_tokens:                  []token.Token{cap: source.len / 3}
        comments_mode:               comments_mode
        pref:                        pref_
        is_fmt:                      pref_.is_fmt
        is_print_line_on_error:      true
        is_print_colored_error:      true
        is_print_rel_paths_on_error: true
    }
    scanner.scan_all_tokens_in_buffer()
    return scanner
}
150 
// used as both the file path and the file base name of scanners, that are made from a string, and not from a file
const internally_generated_v_code = 'internally_generated_v_code'
152 
// new_scanner returns a new scanner for the source code in the string `text`.
// All the tokens of `text` are scanned in the token buffer, before it returns.
pub fn new_scanner(text string, comments_mode CommentsMode, pref_ &pref.Preferences) &Scanner {
    mut scanner := new_plain_scanner(text, comments_mode, pref_)
    scanner.scan_all_tokens_in_buffer()
    return scanner
}
159 
// new_plain_scanner returns a new scanner for the string `text`, *without* scanning anything yet.
// Its file path and file base name are both set to the `internally_generated_v_code` marker.
fn new_plain_scanner(text string, comments_mode CommentsMode, pref_ &pref.Preferences) &Scanner {
    scanner := &Scanner{
        file_path:                   internally_generated_v_code
        file_base:                   internally_generated_v_code
        text:                        text
        all_tokens:                  []token.Token{cap: text.len / 3}
        comments_mode:               comments_mode
        pref:                        pref_
        is_fmt:                      pref_.is_fmt
        is_print_line_on_error:      true
        is_print_colored_error:      true
        is_print_rel_paths_on_error: true
    }
    return scanner
}
174 
// free releases the memory of the token buffer `s.all_tokens`.
@[unsafe]
pub fn (mut s Scanner) free() {
    // Note: s.text is *not* freed, because it is shared with all other util.read_file instances,
    // and strings are not reference counted yet.
    // The token buffer is not shared with anything, so freeing it is fine:
    unsafe { s.all_tokens.free() }
}
185 
// should_parse_comment returns true, when comment tokens should be returned in the current state of the scanner.
@[inline]
fn (s &Scanner) should_parse_comment() bool {
    return match s.comments_mode {
        .parse_comments { true }
        .toplevel_comments { !s.is_inside_toplvl_statement }
        .skip_comments { false }
    }
}
191 
// set_is_inside_toplevel_statement sets the `is_inside_toplvl_statement` flag of the scanner to `newstate`.
// The flag is read by should_parse_comment, when the comments mode is .toplevel_comments .
// Note: this is called by v's parser
pub fn (mut s Scanner) set_is_inside_toplevel_statement(newstate bool) {
    s.is_inside_toplvl_statement = newstate
}
196 
// set_current_tidx moves the current token index of the scanner to `cidx`.
// Values outside of 0 .. s.all_tokens.len are clamped to the nearest of these bounds.
pub fn (mut s Scanner) set_current_tidx(cidx int) {
    upper := s.all_tokens.len
    s.tidx = if cidx < 0 {
        0
    } else if cidx > upper {
        upper
    } else {
        cidx
    }
}
202 
// new_token returns a token of kind `tok_kind`, that is `len` bytes long and ends at the current position.
// It also advances the token index of the scanner.
@[inline]
fn (mut s Scanner) new_token(tok_kind token.Kind, lit string, len int) token.Token {
    token_index := s.tidx
    s.tidx++
    // .hash tokens use the line number as is, all the others are 1 based
    line := if tok_kind == .hash { s.line_nr } else { s.line_nr + 1 }
    // the column of the first character of the token, never less than 1
    start_column := s.current_column() - len + 1
    column := if start_column < 1 { 1 } else { start_column }
    return token.Token{
        kind:     tok_kind
        lit:      lit
        line_nr:  line
        col:      u16(column)
        pos:      s.pos - len + 1
        len:      len
        tidx:     token_index
        file_idx: s.file_idx
    }
}
223 
// new_eof_token returns an .eof token with an empty literal, located at the current position of the scanner.
// Unlike new_token, it takes the scanner by an immutable reference, and does not advance `s.tidx`.
@[inline]
fn (s &Scanner) new_eof_token() token.Token {
    return token.Token{
        kind:     .eof
        lit:      ''
        line_nr:  s.line_nr + 1
        col:      u16(s.current_column())
        pos:      s.pos
        len:      1
        tidx:     s.tidx
        file_idx: s.file_idx
    }
}
237 
// new_multiline_token is like new_token, but the line number of the token is based on `start_line`,
// i.e. on the line, where the token started, instead of on the current line of the scanner.
@[inline]
fn (mut s Scanner) new_multiline_token(tok_kind token.Kind, lit string, len int, start_line int) token.Token {
    token_index := s.tidx
    s.tidx++
    // the column is still based on the current position, and is never less than 1
    start_column := s.current_column() - len + 1
    column := if start_column < 1 { 1 } else { start_column }
    return token.Token{
        kind:     tok_kind
        lit:      lit
        line_nr:  start_line + 1
        col:      u16(column)
        pos:      s.pos - len + 1
        len:      len
        tidx:     token_index
        file_idx: s.file_idx
    }
}
257 
// ident_name returns the name, that starts at the current position.
// The first character is accepted as is, the rest have to be in util.func_char_table .
// On return, s.pos is at the last character of the name.
@[direct_array_access; inline]
fn (mut s Scanner) ident_name() string {
    name_start := s.pos
    s.pos++
    for s.pos < s.text.len && util.func_char_table[s.text[s.pos]] {
        s.pos++
    }
    ident := s.text[name_start..s.pos]
    s.pos--
    return ident
}
274 
// num_lit returns the text of the numeric literal in s.text[start..end].
// For vfmt, the text is returned unchanged. Otherwise, a new string is returned,
// that has all the `_` separators removed.
fn (s &Scanner) num_lit(start int, end int) string {
    if s.is_fmt {
        return s.text[start..end]
    }
    unsafe {
        src := s.text.str
        // 1 more byte than the longest possible result, for the terminating 0
        mut dst := malloc_noscan(end - start + 1)
        mut written := 0
        for idx := start; idx < end; idx++ {
            ch := src[idx]
            if ch == num_sep {
                continue
            }
            dst[written] = ch
            written++
        }
        dst[written] = 0 // C string compatibility
        return dst.vstring_with_len(written)
    }
}
293 
// number_prefixed_identifier_name returns s.text[start_pos..end_pos], when that text looks like
// an identifier, that wrongly starts with a number, for example `123abc`. It returns '' otherwise.
// To qualify, the text has to start with a digit, then continue with digits or `_`, then with a letter,
// then with characters from util.func_char_table only, and the first non whitespace character after it
// has to be one of `:`, `=`, `,`, `)`, `]`, `}`, `.`, `;`, or the end of the text.
@[direct_array_access; inline]
fn (s &Scanner) number_prefixed_identifier_name(start_pos int, end_pos int) string {
    if end_pos <= start_pos || !digit_table[s.text[start_pos]] {
        return ''
    }
    // skip the numeric prefix
    mut name_start := start_pos
    for name_start < end_pos {
        ch := s.text[name_start]
        if !digit_table[ch] && ch != num_sep {
            break
        }
        name_start++
    }
    if name_start >= end_pos || !letter_table[s.text[name_start]] {
        return ''
    }
    for idx := name_start; idx < end_pos; idx++ {
        if !util.func_char_table[s.text[idx]] {
            return ''
        }
    }
    follower := s.next_non_space_char(end_pos)
    if follower !in [`:`, `=`, `,`, `)`, `]`, `}`, `.`, `;`, `\0`] {
        return ''
    }
    return s.text[start_pos..end_pos]
}
317 
// next_non_space_char returns the first character at or after `pos`, that is in util.non_whitespace_table .
// It returns `\0`, when there is no such character till the end of the text.
@[direct_array_access; inline]
fn (s &Scanner) next_non_space_char(pos int) u8 {
    for idx := pos; idx < s.text.len; idx++ {
        ch := s.text[idx]
        if util.non_whitespace_table[ch] {
            return ch
        }
    }
    return `\0`
}
327 
// pos_from_bounds returns a token.Pos for the text between `start_pos` (inclusive) and `end_pos` (exclusive).
// The line number is the current line of the scanner, and the column is derived from the distance
// between `start_pos` and the last recorded newline position.
@[inline]
fn (s &Scanner) pos_from_bounds(start_pos int, end_pos int) token.Pos {
    return token.Pos{
        len:      end_pos - start_pos
        line_nr:  s.line_nr
        pos:      start_pos
        col:      u16_col(start_pos - s.last_nl_pos - 1)
        file_idx: s.file_idx
    }
}
338 
// ident_bin_number scans a binary literal, that starts with `0b` at the current position.
// It reports errors for misplaced `_` separators, for a missing number part, and for unsuitable digits.
// It returns the text of the literal, as produced by num_lit. On return, s.pos is 1 less than
// the position, where the scanning stopped.
@[direct_array_access]
fn (mut s Scanner) ident_bin_number() string {
    mut has_wrong_digit := false
    mut first_wrong_digit_pos := 0
    mut first_wrong_digit := `\0`
    start_pos := s.pos
    s.pos += 2 // skip '0b'
    if s.pos < s.text.len && s.text[s.pos] == num_sep {
        s.error('separator `_` is only valid between digits in a numeric literal')
    }
    for s.pos < s.text.len {
        c := s.text[s.pos]
        if c == num_sep && s.text[s.pos - 1] == num_sep {
            s.error('cannot use `_` consecutively')
        }
        if !c.is_bin_digit() && c != num_sep {
            // anything that is not a digit or a letter ends the literal; inside strings, every non binary digit ends it
            if (!digit_table[c] && !letter_table[c]) || s.is_inside_string || s.is_nested_string {
                break
            } else if !has_wrong_digit {
                // remember only the first unsuitable character, but keep consuming the rest of the literal
                has_wrong_digit = true
                first_wrong_digit_pos = s.pos
                first_wrong_digit = c
            }
        }
        s.pos++
    }
    if s.text[s.pos - 1] == num_sep {
        s.pos--
        s.error('cannot use `_` at the end of a numeric literal')
    } else if start_pos + 2 == s.pos {
        // nothing was consumed after the `0b` prefix
        s.pos-- // adjust error position
        s.error('number part of this binary is not provided')
    } else if has_wrong_digit {
        s.pos = first_wrong_digit_pos // adjust error position
        s.error('this binary number has unsuitable digit `${first_wrong_digit.str()}`')
    }
    number := s.num_lit(start_pos, s.pos)
    s.pos--
    return number
}
379 
// ident_hex_number scans a hexadecimal literal, that starts with `0x` at the current position.
// It reports errors for misplaced `_` separators, for a missing number part, and for unsuitable digits.
// It returns the text of the literal, as produced by num_lit. On return, s.pos is 1 less than
// the position, where the scanning stopped.
@[direct_array_access]
fn (mut s Scanner) ident_hex_number() string {
    mut has_wrong_digit := false
    mut first_wrong_digit_pos := 0
    mut first_wrong_digit := `\0`
    start_pos := s.pos
    // when there is nothing in the text after the `0x` prefix, return just the prefix,
    // without moving s.pos, and without reporting an error here
    if s.pos + 2 >= s.text.len {
        return '0x'
    }
    s.pos += 2 // skip '0x'
    if s.pos < s.text.len && s.text[s.pos] == num_sep {
        s.error('separator `_` is only valid between digits in a numeric literal')
    }
    for s.pos < s.text.len {
        c := s.text[s.pos]
        if c == num_sep && s.text[s.pos - 1] == num_sep {
            s.error('cannot use `_` consecutively')
        }
        if !c.is_hex_digit() && c != num_sep {
            // anything that is not a letter ends the literal; inside strings, every non hex digit ends it
            if !letter_table[c] || s.is_inside_string || s.is_nested_string {
                break
            } else if !has_wrong_digit {
                // remember only the first unsuitable character, but keep consuming the rest of the literal
                has_wrong_digit = true
                first_wrong_digit_pos = s.pos
                first_wrong_digit = c
            }
        }
        s.pos++
    }
    if s.text[s.pos - 1] == num_sep {
        s.pos--
        s.error('cannot use `_` at the end of a numeric literal')
    } else if start_pos + 2 == s.pos {
        // nothing was consumed after the `0x` prefix
        s.pos-- // adjust error position
        s.error('number part of this hexadecimal is not provided')
    } else if has_wrong_digit {
        s.pos = first_wrong_digit_pos // adjust error position
        s.error('this hexadecimal number has unsuitable digit `${first_wrong_digit.str()}`')
    }
    number := s.num_lit(start_pos, s.pos)
    s.pos--
    return number
}
423 
// ident_oct_number scans an octal literal, that starts with `0o` at the current position.
// It reports errors for misplaced `_` separators, for a missing number part, and for unsuitable digits.
// It returns the text of the literal, as produced by num_lit. On return, s.pos is 1 less than
// the position, where the scanning stopped.
@[direct_array_access]
fn (mut s Scanner) ident_oct_number() string {
    mut has_wrong_digit := false
    mut first_wrong_digit_pos := 0
    mut first_wrong_digit := `\0`
    start_pos := s.pos
    s.pos += 2 // skip '0o'
    if s.pos < s.text.len && s.text[s.pos] == num_sep {
        s.error('separator `_` is only valid between digits in a numeric literal')
    }
    for s.pos < s.text.len {
        c := s.text[s.pos]
        if c == num_sep && s.text[s.pos - 1] == num_sep {
            s.error('cannot use `_` consecutively')
        }
        if !c.is_oct_digit() && c != num_sep {
            // anything that is not a digit or a letter ends the literal; inside strings, every non octal digit ends it
            if (!digit_table[c] && !letter_table[c]) || s.is_inside_string || s.is_nested_string {
                break
            } else if !has_wrong_digit {
                // remember only the first unsuitable character, but keep consuming the rest of the literal
                has_wrong_digit = true
                first_wrong_digit_pos = s.pos
                first_wrong_digit = c
            }
        }
        s.pos++
    }
    if s.text[s.pos - 1] == num_sep {
        s.pos--
        s.error('cannot use `_` at the end of a numeric literal')
    } else if start_pos + 2 == s.pos {
        // nothing was consumed after the `0o` prefix
        s.pos-- // adjust error position
        s.error('number part of this octal is not provided')
    } else if has_wrong_digit {
        s.pos = first_wrong_digit_pos // adjust error position
        s.error('this octal number has unsuitable digit `${first_wrong_digit.str()}`')
    }
    number := s.num_lit(start_pos, s.pos)
    s.pos--
    return number
}
464 
// ident_dec_number scans a decimal literal, that starts at the current position.
// The literal can have an integer part, a fractional part and an exponent, for example:
// `123`, `1_000`, `.5`, `5.5`, `5.e5`, `5e-5`. A `.` that starts a range (`5..10`), or
// a method call (`5.str()`), is not consumed as part of the number.
// It reports errors for misplaced `_` separators, for unsuitable digits, for exponents
// without digits, and for numbers with too many decimal points.
// It returns the text of the literal, as produced by num_lit. On return, s.pos is 1 less than
// the position, where the scanning stopped.
@[direct_array_access]
fn (mut s Scanner) ident_dec_number() string {
    mut has_wrong_digit := false
    mut first_wrong_digit_pos := 0
    mut first_wrong_digit := `\0`
    start_pos := s.pos
    // scan integer part
    for s.pos < s.text.len {
        c := s.text[s.pos]
        if c == num_sep && s.text[s.pos - 1] == num_sep {
            s.error('cannot use `_` consecutively')
        }
        if !digit_table[c] && c != num_sep {
            if !letter_table[c] || c in [`e`, `E`] || s.is_inside_string || s.is_nested_string {
                break
            } else if !has_wrong_digit {
                // remember only the first unsuitable character, but keep consuming the rest of the literal
                has_wrong_digit = true
                first_wrong_digit_pos = s.pos
                first_wrong_digit = c
            }
        }
        s.pos++
    }
    // For literals like `.5`, the integer part is empty, and s.pos is still at start_pos.
    // In that case, the previous character is not part of the literal, and for a literal
    // at the very start of the text, it does not even exist. Since the bounds checks are
    // disabled here by @[direct_array_access], look back only when something was consumed.
    if s.pos > start_pos && s.text[s.pos - 1] == num_sep {
        s.pos--
        s.error('cannot use `_` at the end of a numeric literal')
    }
    if has_wrong_digit {
        invalid_ident := s.number_prefixed_identifier_name(start_pos, s.pos)
        if invalid_ident != '' {
            s.error_with_pos('identifier name `${invalid_ident}` cannot start with a number', s.pos_from_bounds(start_pos,
                s.pos))
            number := s.num_lit(start_pos, s.pos)
            s.pos--
            return number
        }
    }
    mut call_method := false // true for, e.g., 5.str(), 5.5.str(), 5e5.str()
    mut is_range := false // true for, e.g., 5..10
    // scan fractional part
    if s.pos < s.text.len && s.text[s.pos] == `.` {
        s.pos++
        if s.pos < s.text.len {
            // 5.5, 5.5.str()
            if digit_table[s.text[s.pos]] {
                for s.pos < s.text.len {
                    c := s.text[s.pos]
                    if !digit_table[c] {
                        if !letter_table[c] || c in [`e`, `E`] || s.is_inside_string
                            || s.is_nested_string {
                            // 5.5.str()
                            if c == `.` && s.pos + 1 < s.text.len && letter_table[s.text[s.pos + 1]] {
                                call_method = true
                            }
                            break
                        } else if !has_wrong_digit {
                            has_wrong_digit = true
                            first_wrong_digit_pos = s.pos
                            first_wrong_digit = c
                        }
                    }
                    s.pos++
                }
            } else if s.text[s.pos] == `.` {
                // 5.. (a range)
                is_range = true
                s.pos--
            } else if s.text[s.pos] in [`e`, `E`] {
                // 5.e5
            } else if letter_table[s.text[s.pos]] {
                // 5.str()
                call_method = true
                s.pos--
            } else {
                // 5.
                // walk back over the digits before the `.`, to show the whole number in the warning
                mut symbol_length := 0
                for i := s.pos - 2; i > 0 && digit_table[s.text[i - 1]]; i-- {
                    symbol_length++
                }
                float_symbol := s.text[s.pos - 2 - symbol_length..s.pos - 1]
                s.warn('float literals should have a digit after the decimal point, e.g. `${float_symbol}.0`')
            }
        }
    }
    // scan exponential part
    mut has_exp := false
    if s.pos < s.text.len && s.text[s.pos] in [`e`, `E`] && !s.is_inside_string {
        has_exp = true
        s.pos++
        if s.pos < s.text.len && s.text[s.pos] in [`-`, `+`] {
            s.pos++
        }
        for s.pos < s.text.len {
            c := s.text[s.pos]
            if !digit_table[c] {
                if !letter_table[c] || s.is_inside_string || s.is_nested_string {
                    // 5e5.str()
                    if c == `.` && s.pos + 1 < s.text.len && letter_table[s.text[s.pos + 1]] {
                        call_method = true
                    }
                    break
                } else if !has_wrong_digit {
                    has_wrong_digit = true
                    first_wrong_digit_pos = s.pos
                    first_wrong_digit = c
                }
            }
            s.pos++
        }
    }
    if has_wrong_digit {
        // error check: wrong digit
        s.pos = first_wrong_digit_pos // adjust error position
        if !s.pref.translated {
            s.error('this number has unsuitable digit `${first_wrong_digit.str()}`')
        }
    } else if s.text[s.pos - 1] in [`e`, `E`] && !s.is_inside_string {
        // error check: 5e
        s.pos-- // adjust error position
        s.error('exponent has no digits')
    } else if s.pos < s.text.len && s.text[s.pos] == `.` && !is_range && !call_method {
        // error check: 1.23.4, 123.e+3.4
        if has_exp {
            s.error('exponential part should be integer')
        } else {
            s.error('too many decimal points in number')
        }
    }
    number := s.num_lit(start_pos, s.pos)
    s.pos--
    return number
}
597 
// ident_number scans the numeric literal at the current position.
// The `0b`, `0x` and `0o` prefixes select the binary, hexadecimal and octal scanners;
// everything else is scanned as a decimal number.
fn (mut s Scanner) ident_number() string {
    if s.expect('0b', s.pos) {
        return s.ident_bin_number()
    }
    if s.expect('0x', s.pos) {
        return s.ident_hex_number()
    }
    if s.expect('0o', s.pos) {
        return s.ident_oct_number()
    }
    return s.ident_dec_number()
}
609 
// skip_whitespace advances s.pos to the next character, that is in util.non_whitespace_table,
// or to the end of the text. Each line feed on the way is passed to s.inc_line_number() .
@[direct_array_access; inline]
fn (mut s Scanner) skip_whitespace() {
    for s.pos < s.text.len {
        ch := s.text[s.pos]
        // tabs and spaces are most common, so they are checked first, and need no table lookup
        if ch != 9 && ch != 32 {
            if ch == b_lf {
                s.inc_line_number()
            } else if util.non_whitespace_table[ch] {
                return
            }
        }
        s.pos++
    }
}
630 
// end_of_file is called, each time the scanner reaches the end of the text.
// It moves s.pos to the end of the text, and returns an .eof token.
// It panics, when it is called more than s.max_eofs times, since that most likely means,
// that the caller is stuck in a loop at the end of the file.
fn (mut s Scanner) end_of_file() token.Token {
    s.eofs++
    if s.eofs > s.max_eofs {
        s.line_nr--
        if s.file_path == internally_generated_v_code {
            // show a bit more context for that case, since the source may not be easily visible by just inspecting a source file on the filesystem
            eprintln('> internally_generated_v_code, start: ${s.text#[0..50]}')
            eprintln('> internally_generated_v_code,   end: ${s.text#[-50..]}')
            eprintln('> internally_generated_v_code,   len: ${s.text.len}')
        }
        panic(
            'the end of file `${s.file_path}` has been reached ${s.max_eofs} times already, the v parser is probably stuck.\n' +
            'This should not happen. Please report the bug here, and include the last 2-3 lines of your source code:\n' +
            'https://github.com/vlang/v/issues/new?labels=Bug&template=bug_report.md')
    }
    // only the first time the end is reached, and only when s.pos is not already exactly at the end of the text
    if s.pos != s.text.len && s.eofs == 1 {
        s.inc_line_number()
    }
    s.pos = s.text.len
    return s.new_eof_token()
}
652 
// scan_all_tokens_in_buffer scans the whole remaining text in s.all_tokens, and then
// rewinds the token index to 0, so that .scan() will start from the first token.
// The 'PARSE' timer is paused during the scanning, and the time spent is accounted to 'SCAN'.
fn (mut s Scanner) scan_all_tokens_in_buffer() {
    mut timers := util.get_timers()
    timers.measure_pause('PARSE')
    util.timing_start('SCAN')
    defer {
        util.timing_measure_cumulative('SCAN')
        timers.measure_resume('PARSE')
    }
    s.scan_remaining_text()
    s.tidx = 0
    // dump all the scanned tokens, when compiled with `-d trace_scanner`
    $if trace_scanner ? {
        for t in s.all_tokens {
            eprintln('> tidx:${t.tidx:-5} | kind: ${t.kind:-10} | lit.len: ${t.lit.len:-5} | lit: `${t.lit}`')
        }
    }
}
669 
// scan_remaining_text appends the tokens of the rest of the text to s.all_tokens .
// It stops after the .eof token, or after the first stored token, when s.should_abort is set.
// In the .skip_comments mode, the comment tokens are not stored at all.
fn (mut s Scanner) scan_remaining_text() {
    drop_comments := s.comments_mode == .skip_comments
    for {
        tok := s.text_scan()
        if drop_comments && tok.kind == .comment {
            continue
        }
        s.all_tokens << tok
        if tok.kind == .eof || s.should_abort {
            break
        }
    }
}
682 
// scan returns the next token from the token buffer, and advances the token index.
// Comment tokens are skipped, unless should_parse_comment() allows them.
// When the buffer is exhausted, or when s.should_abort is set, it returns the result of end_of_file() .
@[direct_array_access]
pub fn (mut s Scanner) scan() token.Token {
    for {
        idx := s.tidx
        s.tidx++
        if s.should_abort || idx >= s.all_tokens.len {
            return s.end_of_file()
        }
        tok := s.all_tokens[idx]
        if tok.kind != .comment || s.should_parse_comment() {
            return tok
        }
    }
    return s.new_eof_token()
}
698 
// peek_token returns the token, that is `n` positions after the current token index,
// without changing the state of the scanner. `n` can be negative.
// It returns an .eof token, when that position is outside of the token buffer.
@[direct_array_access; inline]
pub fn (s &Scanner) peek_token(n int) token.Token {
    wanted := s.tidx + n
    if wanted < 0 || wanted >= s.all_tokens.len {
        return s.new_eof_token()
    }
    return s.all_tokens[wanted]
}
708 
// look_ahead returns the character, that is `n` positions after the current one.
// It returns `\0`, when that position is at or after the end of the text.
@[direct_array_access; inline]
fn (s &Scanner) look_ahead(n int) u8 {
    target := s.pos + n
    if target >= s.text.len {
        return `\0`
    }
    return s.text[target]
}
717 
718 // text_scan returns a single token from the text, and updates the scanner state,
719 // so that it will be ready to get the next token right after that.
720 // See also Scanner.prepare_for_new_text and new_silent_scanner()
721 @[direct_array_access]
722 pub fn (mut s Scanner) text_scan() token.Token {
723     // The for loop here is so that instead of doing
724     // `return s.scan()` (which will use a new call stack frame),
725     // text_scan can just do continue, keeping
726     // memory & stack usage low.
727     // That optimization mostly matters for long sections
728     // of comments and string literals.
729     for {
730         s.pos++
731         if !s.is_inside_string {
732             s.skip_whitespace()
733         }
734         if s.pos >= s.text.len || s.should_abort {
735             return s.end_of_file()
736         }
737         s.skip_whitespace()
738         // end of file
739         if s.pos >= s.text.len {
740             return s.end_of_file()
741         }
742         // handle each char
743         c := s.text[s.pos]
744         nextc := s.look_ahead(1)
745         // name or keyword
746         if util.name_char_table[c] {
747             name := s.ident_name()
748             kind := token.scanner_matcher.find(name)
749             if kind != -1 {
750                 return s.new_token(unsafe { token.Kind(kind) }, name, name.len)
751             }
752             return s.new_token(.name, name, name.len)
753         } else if digit_table[c] || (c == `.` && digit_table[nextc]) {
754             // `123`, `.123`
755             if !s.is_inside_string {
756                 // In C ints with `0` prefix are octal (in V they're decimal), so discarding heading zeros is needed.
757                 mut start_pos := s.pos
758                 for start_pos < s.text.len && s.text[start_pos] == `0` {
759                     start_pos++
760                 }
761                 mut prefix_zero_num := start_pos - s.pos // how many prefix zeros should be jumped
762                 // for 0b, 0o, 0x the heading zero shouldn't be jumped
763                 if start_pos == s.text.len || (c == `0` && !digit_table[s.text[start_pos]]) {
764                     prefix_zero_num--
765                 }
766                 s.pos += prefix_zero_num // jump these zeros
767             }
768             num := s.ident_number()
769             return s.new_token(.number, num, num.len)
770         }
771         // all other tokens
772         match c {
773             `+` {
774                 if nextc == `+` {
775                     s.pos++
776                     return s.new_token(.inc, '', 2)
777                 } else if nextc == `=` {
778                     s.pos++
779                     return s.new_token(.plus_assign, '', 2)
780                 }
781                 return s.new_token(.plus, '', 1)
782             }
783             `-` {
784                 if nextc == `-` {
785                     s.pos++
786                     return s.new_token(.dec, '', 2)
787                 } else if nextc == `=` {
788                     s.pos++
789                     return s.new_token(.minus_assign, '', 2)
790                 }
791                 return s.new_token(.minus, '', 1)
792             }
793             `*` {
794                 if nextc == `*` {
795                     if s.look_ahead(2) == `=` {
796                         s.pos += 2
797                         return s.new_token(.power_assign, '', 3)
798                     }
799                     s.pos++
800                     return s.new_token(.power, '', 2)
801                 }
802                 if nextc == `=` {
803                     s.pos++
804                     return s.new_token(.mult_assign, '', 2)
805                 }
806                 return s.new_token(.mul, '', 1)
807             }
808             `^` {
809                 if nextc == `=` {
810                     s.pos++
811                     return s.new_token(.xor_assign, '', 2)
812                 }
813                 return s.new_token(.xor, '', 1)
814             }
815             `%` {
816                 if nextc == `=` {
817                     s.pos++
818                     return s.new_token(.mod_assign, '', 2)
819                 }
820                 return s.new_token(.mod, '', 1)
821             }
822             `?` {
823                 return s.new_token(.question, '?', 1)
824             }
825             single_quote, double_quote {
826                 if s.is_likely_unclosed_string_interpolation(c) {
827                     s.error_with_pos('expected `}` to close string interpolation', token.Pos{
828                         len:       1
829                         line_nr:   s.line_nr
830                         pos:       s.pos
831                         col:       u16_col(s.current_column() - 1)
832                         file_idx:  s.file_idx
833                         last_line: s.line_nr
834                     })
835                 }
836                 s.str_helper_tokens << c
837                 start_line := s.line_nr
838                 ident_string := s.ident_string()
839                 return s.new_multiline_token(.string, ident_string, ident_string.len + 2,
840                     start_line) // + two quotes
841             }
842             `\`` {
843                 // ` // apostrophe balance comment. do not remove
844                 ident_char := s.ident_char()
845                 return s.new_token(.chartoken, ident_char, ident_char.len + 2) // + two quotes
846             }
847             `(` {
848                 return s.new_token(.lpar, '', 1)
849             }
850             `)` {
851                 return s.new_token(.rpar, '', 1)
852             }
853             `[` {
854                 return s.new_token(.lsbr, '', 1)
855             }
856             `]` {
857                 return s.new_token(.rsbr, '', 1)
858             }
859             `{` {
860                 // Keep interpolation helper state only while scanning string interpolation.
861                 if s.str_helper_tokens.len > 0 {
862                     // Skip { in `${` in strings
863                     if 255 != s.str_quote() {
864                         s.str_helper_tokens << 0
865                     } else {
866                         s.str_helper_tokens << c
867                     }
868                 }
869                 if s.is_inside_string && s.text[s.pos - 1] == `$` {
870                     continue
871                 }
872                 return s.new_token(.lcbr, '', 1)
873             }
874             `$` {
875                 if s.is_inside_string {
876                     return s.new_token(.str_dollar, '', 1)
877                 } else {
878                     return s.new_token(.dollar, '', 1)
879                 }
880             }
881             `}` {
882                 // s = `hello ${name} !`
883                 if s.str_helper_tokens.len > 0 {
884                     s.str_helper_tokens.delete_last()
885                     quote := s.str_quote()
886                     if 255 != quote {
887                         if s.pos < s.text.len - 1 {
888                             s.pos++
889                         } else {
890                             s.error('unfinished string literal')
891                         }
892                         if s.text[s.pos] == quote {
893                             s.is_inside_string = false
894                             s.str_helper_tokens.delete_last()
895                             return s.new_token(.string, '', 1)
896                         }
897                         ident_string := s.ident_string()
898                         return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
899                     }
900                 }
901                 return s.new_token(.rcbr, '', 1)
902             }
903             `&` {
904                 if nextc == `&` {
905                     if s.look_ahead(2) == `=` {
906                         s.pos += 2
907                         return s.new_token(.boolean_and_assign, '', 3)
908                     }
909                 }
910                 if nextc == `=` {
911                     s.pos++
912                     return s.new_token(.and_assign, '', 2)
913                 }
914                 afternextc := s.look_ahead(2)
915                 if nextc == `&` && (afternextc.is_space() || afternextc == `!`) {
916                     s.pos++
917                     return s.new_token(.and, '', 2)
918                 }
919                 return s.new_token(.amp, '', 1)
920             }
921             `|` {
922                 if nextc == `|` {
923                     if s.look_ahead(2) == `=` {
924                         s.pos += 2
925                         return s.new_token(.boolean_or_assign, '', 3)
926                     }
927                     s.pos++
928                     return s.new_token(.logical_or, '', 2)
929                 }
930                 if nextc == `=` {
931                     s.pos++
932                     return s.new_token(.or_assign, '', 2)
933                 }
934                 return s.new_token(.pipe, '', 1)
935             }
936             `,` {
937                 return s.new_token(.comma, '', 1)
938             }
939             `@` {
940                 // @[attr]
941                 if s.text[s.pos + 1] == `[` {
942                     return s.new_token(.at, '', 1)
943                 }
944                 mut name := ''
945                 if nextc != `\0` {
946                     s.pos++
947                     name = s.ident_name()
948                 }
949                 if s.is_fmt {
950                     return s.new_token(.name, '@' + name, name.len + 1)
951                 }
952                 // @FN, @STRUCT, @MOD etc. See full list in token.valid_at_tokens
953                 if '@' + name in token.valid_at_tokens || name.starts_with('cc') { // `=@cccond` in inline assembly
954                     return s.new_token(.at, '@' + name, name.len + 1)
955                 }
956                 if !token.is_key(name) {
957                     // If name is all uppercase, the user is probably looking for a compile time variable ("at-token")
958                     if name.is_upper() {
959                         comptime_vars := token.valid_at_tokens.join(', ')
960                         s.add_error_detail('available compile time variables: ${comptime_vars}'.wrap(
961                             width: 90
962                         ))
963                     }
964                     s.error('@ must be used before keywords or compile time variables (e.g. `@type string` or `@FN`)')
965                 } else {
966                     // s.note('@keyword is being deprecated and then removed from V. Use `keyword_` or a different name (e.g. `typ` instead of `type`)')
967                 }
968                 return s.new_token(.name, name, name.len)
969             }
970             `.` {
971                 if nextc == `.` {
972                     s.pos++
973                     if s.pos + 1 < s.text.len && s.text[s.pos + 1] == `.` {
974                         s.pos++
975                         return s.new_token(.ellipsis, '', 3)
976                     }
977                     return s.new_token(.dotdot, '', 2)
978                 }
979                 return s.new_token(.dot, '', 1)
980             }
981             `#` {
982                 // manage gated arrays/strings
983                 if nextc == `[` {
984                     s.pos++
985                     return s.new_token(.nilsbr, '', 2)
986                 }
987 
988                 start := s.pos + 1
989                 s.ignore_line()
990                 if nextc == `!` {
991                     // treat shebang line (#!) as a comment
992                     comment := s.text[start - 1..s.pos].trim_space()
993                     if s.line_nr != 1 {
994                         comment_pos := token.Pos{
995                             line_nr:  s.line_nr - 1
996                             len:      comment.len
997                             pos:      start
998                             col:      u16_col(s.current_column() - comment.len)
999                             file_idx: s.file_idx
1000                         }
1001                         s.error_with_pos('a shebang is only valid at the top of the file',
1002                             comment_pos)
1003                     }
1004                     // s.fgenln('// shebang line "$s.line_comment"')
1005                     return s.new_token(.comment, comment, comment.len + 2)
1006                 }
1007                 hash := s.text[start..s.pos].trim_space()
1008                 return s.new_token(.hash, hash, hash.len + 2)
1009             }
1010             `>` {
1011                 if nextc == `=` {
1012                     s.pos++
1013                     return s.new_token(.ge, '', 2)
1014                 } else if nextc == `>` {
1015                     if s.pos + 2 < s.text.len {
1016                         if s.text[s.pos + 2] == `=` {
1017                             s.pos += 2
1018                             return s.new_token(.right_shift_assign, '', 3)
1019                         } else if s.text[s.pos + 2] == `>` {
1020                             if s.pos + 3 < s.text.len && s.text[s.pos + 3] == `=` {
1021                                 s.pos += 3
1022                                 return s.new_token(.unsigned_right_shift_assign, '', 4)
1023                             }
1024                             s.pos += 2
1025                             return s.new_token(.unsigned_right_shift, '', 3)
1026                         }
1027                     }
1028                     s.pos++
1029                     return s.new_token(.right_shift, '', 2)
1030                 }
1031                 return s.new_token(.gt, '', 1)
1032             }
1033             `<` {
1034                 if nextc == `=` {
1035                     s.pos++
1036                     return s.new_token(.le, '', 2)
1037                 } else if nextc == `<` {
1038                     if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
1039                         s.pos += 2
1040                         return s.new_token(.left_shift_assign, '', 3)
1041                     }
1042                     s.pos++
1043                     return s.new_token(.left_shift, '', 2)
1044                 } else if nextc == `-` {
1045                     s.pos++
1046                     return s.new_token(.arrow, '', 2)
1047                 } else {
1048                     s.last_lt = s.pos
1049                     return s.new_token(.lt, '', 1)
1050                 }
1051             }
1052             `=` {
1053                 if nextc == `=` {
1054                     s.pos++
1055                     return s.new_token(.eq, '', 2)
1056                 } else {
1057                     return s.new_token(.assign, '', 1)
1058                 }
1059             }
1060             `:` {
1061                 if nextc == `=` {
1062                     s.pos++
1063                     return s.new_token(.decl_assign, '', 2)
1064                 } else {
1065                     return s.new_token(.colon, '', 1)
1066                 }
1067             }
1068             `;` {
1069                 return s.new_token(.semicolon, '', 1)
1070             }
1071             `!` {
1072                 if nextc == `=` {
1073                     s.pos++
1074                     return s.new_token(.ne, '', 2)
1075                 } else if s.text.len > s.pos + 3 && nextc == `i` && s.text[s.pos + 2] == `n`
1076                     && s.text[s.pos + 3].is_space() {
1077                     s.pos += 2
1078                     return s.new_token(.not_in, '', 3)
1079                 } else if s.text.len > s.pos + 3 && nextc == `i` && s.text[s.pos + 2] == `s`
1080                     && s.text[s.pos + 3].is_space() {
1081                     s.pos += 2
1082                     return s.new_token(.not_is, '', 3)
1083                 } else {
1084                     return s.new_token(.not, '!', 1)
1085                 }
1086             }
1087             `~` {
1088                 return s.new_token(.bit_not, '', 1)
1089             }
1090             `/` {
1091                 if nextc == `=` {
1092                     s.pos++
1093                     return s.new_token(.div_assign, '', 2)
1094                 }
1095                 if nextc == `/` { // Single line comments
1096                     start := s.pos + 1
1097                     s.ignore_line()
1098                     mut comment_line_end := s.pos
1099                     if s.text[s.pos - 1] == b_cr {
1100                         comment_line_end--
1101                         s.pos--
1102                     }
1103                     // fix line_nr, \n was read; the comment is marked on the next line
1104                     s.pos--
1105                     s.line_nr--
1106                     if s.should_parse_comment() {
1107                         s.line_comment = s.text[start + 1..comment_line_end]
1108                         mut comment := s.line_comment
1109                         // Find out if this comment is on its own line (for vfmt)
1110                         mut is_separate_line_comment := true
1111                         for j := start - 2; j >= 0 && s.text[j] != b_lf; j-- {
1112                             if s.text[j] !in [`\t`, ` `] {
1113                                 is_separate_line_comment = false
1114                             }
1115                         }
1116                         if is_separate_line_comment {
1117                             // Note: ´\x01´ is used to preserve the initial whitespace in comments
1118                             //     that are on a separate line
1119                             comment = '\x01' + comment
1120                         }
1121                         return s.new_token(.comment, comment, s.line_comment.len + 2)
1122                     }
1123                     // Skip the comment (return the next token)
1124                     continue
1125                 } else if nextc == `*` { // Multiline comments
1126                     start := s.pos + 2
1127                     start_line := s.line_nr
1128                     mut nest_count := 1
1129                     s.pos++
1130                     // Skip comment
1131                     for nest_count > 0 && s.pos < s.text.len - 1 {
1132                         s.pos++
1133                         if s.pos >= s.text.len - 1 {
1134                             s.line_nr = start_line
1135                             s.error('unterminated multiline comment')
1136                         }
1137                         if s.text[s.pos] == b_lf {
1138                             s.inc_line_number()
1139                             continue
1140                         }
1141                         if s.expect('/*', s.pos) && s.text[s.pos + 2] != `/` {
1142                             nest_count++
1143                             continue
1144                         }
1145                         if s.expect('*/', s.pos) {
1146                             nest_count--
1147                         }
1148                     }
1149                     s.pos++
1150                     if s.should_parse_comment() {
1151                         mut comment := s.text[start..(s.pos - 1)]
1152                         if !comment.contains('\n') {
1153                             comment_pos := token.Pos{
1154                                 line_nr:  start_line
1155                                 len:      comment.len + 4
1156                                 pos:      start
1157                                 col:      u16_col(s.current_column() - comment.len - 4)
1158                                 file_idx: s.file_idx
1159                             }
1160                             if !s.pref.is_fmt {
1161                                 s.error_with_pos('inline comment is deprecated, please use line comment',
1162                                     comment_pos)
1163                             }
1164                             comment = '\x01' + comment.trim(' ')
1165                         }
1166                         return s.new_multiline_token(.comment, comment, comment.len + 4, start_line)
1167                     }
1168                     // Skip if not in fmt mode
1169                     continue
1170                 }
1171                 return s.new_token(.div, '', 1)
1172             }
1173             else {}
1174         }
1175 
1176         $if windows {
1177             if c == `\0` {
1178                 return s.end_of_file()
1179             }
1180         }
1181         s.invalid_character()
1182         break
1183     }
1184     return s.end_of_file()
1185 }
1186 
// invalid_character reports the character found at the current position as invalid.
// The reported slice starts at `s.pos`, spans as many bytes as `utf8_char_len`
// returns for the byte at `s.pos`, and is clamped to the end of the text.
fn (mut s Scanner) invalid_character() {
    first := s.pos
    // never slice past the end of the text, even when the sequence is truncated
    last := int_min(first + utf8_char_len(s.text[first]), s.text.len)
    offending := s.text[first..last]
    s.error('invalid character `${offending}`')
}
1193 
// current_column returns the distance, in bytes, between the current position
// and the position recorded for the last newline (`s.last_nl_pos`).
@[inline]
fn (s &Scanner) current_column() int {
    column := s.pos - s.last_nl_pos
    return column
}
1198 
// count_symbol_before returns the length of the run of consecutive `sym` bytes
// that ends at index `p` (inclusive), scanning backwards towards the start of
// the text. It returns 0 when `p` is negative or `s.text[p]` is not `sym`.
@[direct_array_access]
fn (s &Scanner) count_symbol_before(p int, sym u8) int {
    mut idx := p
    // step left for as long as we are still inside the run of `sym`
    for idx >= 0 && s.text[idx] == sym {
        idx--
    }
    // idx now rests on the first non-matching byte (or -1), so the difference is the run length
    return p - idx
}
1210 
// ident_string returns a lexed V string, starting from the current position in the text.
// It supports r'strings', c'strings', interpolated 'strings' and "strings", and hex
// escapes in them (except in the r'strings' where the content is returned verbatim).
//
// The returned value is the literal content without the surrounding quotes. When the
// literal is interrupted by a `${`, scanning stops right before the `$`, the text up to
// that point is returned, and `s.is_inside_string` is set so that the caller can lex the
// interpolated expression next. `\x`, `\u` and `\U` escapes are validated while scanning
// and (outside of vfmt mode) replaced by the bytes they denote.
// Errors are reported through `s.error`; an empty string is returned when
// `s.str_quote()` yields 255.
@[direct_array_access]
pub fn (mut s Scanner) ident_string() string {
    quote := s.str_quote()
    if 255 == quote {
        return ''
    }
    s.quote = quote
    // a literal that starts while `is_inside_string` is still set is a nested string
    s.is_nested_string = s.is_inside_string
    // remembered only to point at the start of the literal in the `unfinished string` error
    lspos := token.Pos{
        line_nr:  s.line_nr
        pos:      s.pos
        col:      u16(s.pos - s.last_nl_pos - 1)
        file_idx: s.file_idx
    }
    q := s.text[s.pos]
    is_quote := q in [single_quote, double_quote]
    // the r/c prefix only counts when we start on a quote and are not resuming after `${...}`
    is_raw := is_quote && s.pos > 0 && s.text[s.pos - 1] == `r` && !s.is_inside_string
    is_cstr := is_quote && s.pos > 0 && s.text[s.pos - 1] == `c` && !s.is_inside_string
    mut n_cr_chars := 0
    mut start := s.pos
    start_char := s.text[start]
    if start_char == s.quote {
        // skip the opening quote
        start++
    } else if start_char == b_lf {
        s.inc_line_number()
    }
    s.is_inside_string = false
    s.u16_escapes_pos.clear()
    s.u32_escapes_pos.clear()
    s.h_escapes_pos.clear()
    // number of consecutive backslashes ending at the current character
    mut backslash_count := if start_char == backslash { 1 } else { 0 }
    for {
        s.pos++
        if s.pos >= s.text.len {
            if lspos.line_nr + 1 < s.line_nr {
                s.add_error_detail_with_pos('literal started here', lspos)
            }
            s.error('unfinished string literal')
            break
        }
        c := s.text[s.pos]
        prevc := s.text[s.pos - 1]
        if c == backslash {
            backslash_count++
        }
        // end of string
        if c == s.quote && (is_raw || backslash_count & 1 == 0) {
            // handle '123\\' backslash at the end
            break
        }
        if c == b_cr {
            n_cr_chars++
        }
        if c == b_lf {
            s.inc_line_number()
        }
        // Escape `\x` `\u` `\U`
        if backslash_count & 1 == 1 && !is_raw && !is_cstr {
            // The digit checks below go through has_hex_digits_at, which never reads past
            // the end of the text. A quote is not a hex digit, so "a quote comes too early"
            // and "a digit is missing" are both covered by the same test.
            // Escape `\x`
            if c == `x` {
                if !s.has_hex_digits_at(s.pos + 1, 2) {
                    s.error(r'`\x` used without two following hex digits')
                }
                s.h_escapes_pos << s.pos - 1
            }
            // Escape `\u`
            if c == `u` {
                if !s.has_hex_digits_at(s.pos + 1, 4) {
                    s.error(r'`\u` incomplete 16 bit unicode character value')
                }
                s.u16_escapes_pos << s.pos - 1
            }
            // Escape `\U`
            if c == `U` {
                if !s.has_hex_digits_at(s.pos + 1, 8) {
                    s.error(r'`\U` incomplete 32 bit unicode character value')
                }
                s.u32_escapes_pos << s.pos - 1
            }
            // Unknown escape sequence
            if !util.is_escape_sequence(c) && !digit_table[c] && c != `\n` {
                s.error('`${c.ascii_str()}` unknown escape sequence')
            }
        }
        // ${var} (ignore in vfmt mode) (skip \$)
        if prevc == `$` && c == `{` && !is_raw
            && s.count_symbol_before(s.pos - 2, backslash) & 1 == 0 {
            s.is_inside_string = true
            // so that s.pos points to $ at the next step
            s.pos -= 2
            break
        }
        if c != backslash {
            backslash_count = 0
        }
    }
    mut lit := ''
    mut end := s.pos
    if s.is_inside_string {
        // s.pos was moved back to just before the `$`; include that last character
        end++
    }
    if start <= s.pos {
        mut string_so_far := s.text[start..end]
        if !s.is_fmt {
            // splice the decoded escapes between the untouched parts of the literal
            mut segment_idx := 0
            s.str_segments.clear()
            if s.u16_escapes_pos.len + s.h_escapes_pos.len + s.u32_escapes_pos.len > 0 {
                s.all_pos.clear()
                s.all_pos << s.u16_escapes_pos
                s.all_pos << s.u32_escapes_pos
                s.all_pos << s.h_escapes_pos
                s.all_pos.sort()

                for pos in s.all_pos {
                    s.str_segments << string_so_far[segment_idx..(pos - start)]
                    segment_idx = pos - start
                    if pos in s.u16_escapes_pos {
                        decoded := s.decode_u16_escape_single(string_so_far, segment_idx)
                        s.str_segments << decoded.segment
                        segment_idx = decoded.idx
                    }
                    if pos in s.u32_escapes_pos {
                        decoded := s.decode_u32_escape_single(string_so_far, segment_idx)
                        s.str_segments << decoded.segment
                        segment_idx = decoded.idx
                    }
                    if pos in s.h_escapes_pos {
                        decoded := s.decode_h_escape_single(string_so_far, segment_idx)
                        s.str_segments << decoded.segment
                        segment_idx = decoded.idx
                    }
                }
            }
            if segment_idx < string_so_far.len {
                s.str_segments << string_so_far[segment_idx..]
            }
            string_so_far = s.str_segments.join('')
        }

        if n_cr_chars > 0 {
            string_so_far = string_so_far.replace('\r', '')
        }
        if !is_raw && string_so_far.contains('\\\n') {
            lit = trim_slash_line_break(string_so_far)
        } else {
            lit = string_so_far
        }
    }
    // `end` equals s.text.len after an unfinished literal; do not index past the text then
    if end < s.text.len && s.text[end] == quote {
        // the literal was closed by its quote, so drop its helper token
        s.str_helper_tokens.delete_last()
    }
    return lit
}

// has_hex_digits_at reports whether `s.text` contains `n` hex digits starting at
// index `from`. It returns false, without reading anything, when that range does
// not lie entirely inside the text.
@[direct_array_access]
fn (s &Scanner) has_hex_digits_at(from int, n int) bool {
    if from < 0 || n < 0 || from + n > s.text.len {
        return false
    }
    for i in from .. from + n {
        if !s.text[i].is_hex_digit() {
            return false
        }
    }
    return true
}
1383 
1384 struct DecodedEscape { // result of decoding one escape sequence; field order matters for positional initialisers
1385     idx     int // index in the decoded input just past the escape; 0 when decoding failed
1386     segment string // the bytes that replace the escape; '' when decoding failed
1387 }
1388 
// decode_h_escape_single converts the `\xXX` escape that starts at `str[idx]` into
// the single byte it denotes.
// It returns the index just past the escape together with that byte as a string.
// When the escape does not fit into `str`, an error is reported and
// `DecodedEscape{0, ''}` is returned.
fn (mut s Scanner) decode_h_escape_single(str string, idx int) DecodedEscape {
    digits_from := idx + 2 // skip the `\x` prefix
    digits_to := idx + 4 // "\xXX".len == 4
    if digits_from > str.len || digits_to > str.len {
        s.error_with_pos('unfinished single hex escape started at', s.current_pos())
        return DecodedEscape{0, ''}
    }
    // no real decoding takes place: '\xc0' is simply replaced with the byte 0xc0;
    // digits that fail to parse fall back to the byte 0
    byte_value := u8(strconv.parse_uint(str[digits_from..digits_to], 16, 8) or { 0 })
    return DecodedEscape{
        idx:     digits_to
        segment: [byte_value].bytestr()
    }
}
1401 
// decode_h_escapes replaces every single-byte inline hex escape like '\xc0' in `sinput`
// with the byte it denotes.
// `escapes_pos` holds the positions of the escapes, expressed relative to `start`
// (i.e. `pos - start` is the index of the backslash inside `sinput`).
// `sinput` is returned unchanged when there are no escapes. An empty string is returned
// after an error has been reported for a truncated escape, or for an escape that is cut
// short by the start of the next one (e.g. `\x6\x61`).
fn (mut s Scanner) decode_h_escapes(sinput string, start int, escapes_pos []int) string {
    if escapes_pos.len == 0 {
        return sinput
    }
    mut ss := []string{cap: escapes_pos.len * 2 + 1}
    ss << sinput[..escapes_pos.first() - start] // everything before the first escape
    for i, pos in escapes_pos {
        decoded := s.decode_h_escape_single(sinput, pos - start)
        if decoded.idx == 0 {
            // A successful decode always ends at idx + 4 > 0, so 0 can only be the failure
            // value of decode_h_escape_single, which has already reported the error.
            return ''
        }
        // the plain text after this escape runs up to the next escape, or to the end
        next_stop := if i + 1 < escapes_pos.len { escapes_pos[i + 1] - start } else { sinput.len }
        if decoded.idx > next_stop {
            // the next escape begins inside this one; slicing would be out of bounds
            s.error_with_pos('unfinished hex escape started at', s.current_pos())
            return ''
        }
        ss << decoded.segment
        ss << sinput[decoded.idx..next_stop]
    }
    return ss.join('')
}
1425 
// decode_o_escapes replaces every single-byte inline octal escape like '\141' in `sinput`
// with the byte it denotes.
// `escapes_pos` holds the positions of the escapes, expressed relative to `start`
// (i.e. `pos - start` is the index of the backslash inside `sinput`).
// `sinput` is returned unchanged when there are no escapes. An error is reported and an
// empty string is returned when an escape is truncated, either by the end of the input
// or by the start of the next escape (e.g. `\1\141`).
fn (mut s Scanner) decode_o_escapes(sinput string, start int, escapes_pos []int) string {
    if escapes_pos.len == 0 {
        return sinput
    }
    // one leading part, plus a decoded byte and a trailing part per escape
    mut ss := []string{cap: escapes_pos.len * 2 + 1}
    ss << sinput[..escapes_pos.first() - start] // everything before the first escape code position
    for i, pos in escapes_pos {
        idx := pos - start
        end_idx := idx + 4 // "\XXX".len == 4
        // the plain text after this escape runs up to the next escape, or to the end
        next_stop := if i + 1 < escapes_pos.len { escapes_pos[i + 1] - start } else { sinput.len }
        if end_idx > sinput.len || end_idx > next_stop {
            s.error_with_pos('unfinished octal escape started at', s.current_pos())
            return ''
        }
        // notice this function doesn't do any decoding... it just replaces '\141' with the byte 0o141
        octal_byte := u8(strconv.parse_uint(sinput[idx + 1..end_idx], 8, 8) or { 0 })
        ss << [octal_byte].bytestr()
        ss << sinput[end_idx..next_stop]
    }
    return ss.join('')
}
1451 
// decode_u16_escape_single converts the `\uXXXX` escape that starts at `str[idx]`
// into the UTF-8 string produced by `utf32_to_str` for its code point.
// It returns the index just past the escape together with that string.
// When the escape does not fit into `str`, an error is reported and
// `DecodedEscape{0, ''}` is returned.
fn (mut s Scanner) decode_u16_escape_single(str string, idx int) DecodedEscape {
    hex_from := idx + 2 // skip the `\u` prefix
    hex_to := idx + 6 // "\uXXXX".len == 6
    if hex_from > str.len || hex_to > str.len {
        s.error_with_pos('unfinished u16 escape started at', s.current_pos())
        return DecodedEscape{0, ''}
    }
    // digits that fail to parse fall back to code point 0
    code_point := strconv.parse_uint(str[hex_from..hex_to], 16, 32) or { 0 }
    // a length of -1 is treated as an invalid code point; it is reported,
    // but the conversion below is still attempted
    if rune(code_point).length_in_bytes() == -1 {
        s.error('invalid unicode point `${str}`')
    }
    return DecodedEscape{
        idx:     hex_to
        segment: utf32_to_str(u32(code_point))
    }
}
1465 
// decode_u16erune decodes the 16 bit unicode escape at the very start of `str`
// into its utf-8 bytes, and appends whatever follows the escape in `str` unchanged.
fn (mut s Scanner) decode_u16erune(str string) string {
    decoded := s.decode_u16_escape_single(str, 0)
    if decoded.idx == str.len {
        // the escape covers the whole input, nothing is left to append
        return decoded.segment
    }
    remainder := str[decoded.idx..]
    return [decoded.segment, remainder].join('')
}
1477 
// decode_u32_escape_single converts the `\UXXXXXXXX` escape that starts at `str[idx]`
// into the UTF-8 string produced by `utf32_to_str` for its code point.
// It returns the index just past the escape together with that string.
// When the escape does not fit into `str`, an error is reported and
// `DecodedEscape{0, ''}` is returned.
fn (mut s Scanner) decode_u32_escape_single(str string, idx int) DecodedEscape {
    hex_from := idx + 2 // skip the `\U` prefix
    hex_to := idx + 10 // "\UXXXXXXXX".len == 10
    if hex_from > str.len || hex_to > str.len {
        s.error_with_pos('unfinished u32 escape started at', s.current_pos())
        return DecodedEscape{0, ''}
    }
    // digits that fail to parse fall back to code point 0
    code_point := strconv.parse_uint(str[hex_from..hex_to], 16, 32) or { 0 }
    // a length of -1 is treated as an invalid code point; it is reported,
    // but the conversion below is still attempted
    if rune(code_point).length_in_bytes() == -1 {
        s.error('invalid unicode point `${str}`')
    }
    return DecodedEscape{
        idx:     hex_to
        segment: utf32_to_str(u32(code_point))
    }
}
1491 
// decode_u32erune decodes the 32 bit unicode escape at the very start of `str`
// into its utf-8 bytes, and appends whatever follows the escape in `str` unchanged.
fn (mut s Scanner) decode_u32erune(str string) string {
    decoded := s.decode_u32_escape_single(str, 0)
    if decoded.idx == str.len {
        // the escape covers the whole input, nothing is left to append
        return decoded.segment
    }
    remainder := str[decoded.idx..]
    return [decoded.segment, remainder].join('')
}
1503 
// trim_slash_line_break removes line continuations from `s`: a backslash that is
// directly followed by a newline is dropped together with that newline and all the
// whitespace after it. The backslash only counts as a continuation when the run of
// backslashes it terminates has an odd length (an even run means the last backslash
// is itself escaped); otherwise the text is left untouched.
fn trim_slash_line_break(s string) string {
    mut out := s
    mut search_from := 0
    for {
        // position of the next `\` that is immediately followed by a newline
        hit := out.index_after('\\\n', search_from) or { break }
        search_from = hit
        // walk left to the first backslash of the run that ends at `hit`
        mut run_start := hit
        for run_start > 0 && out[run_start - 1] == `\\` {
            run_start--
        }
        run_len := hit - run_start + 1
        if run_len % 2 == 0 {
            // escaped backslash: nothing to strip, move on so that the loop terminates
            search_from++
            continue
        }
        // cut the backslash, the newline, and the indentation of the following line
        tail := out[hit + 2..].trim_left(' \n\t\v\f\r')
        out = out[..hit] + tail
    }
    return out
}
1528 
// ident_char is called when a backtick "single-char" is parsed from the code.
// It is needed because some runes (chars) are written with escape sequences.
// The string it returns should be a standardized, simplified version of the
// character, as it would appear in source code.
// Possibilities:
//   single chars like `a`, `b` => 'a', 'b'
//   escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
//   escaped single hex bytes like `\x01`, `\x61` => '\x01', 'a'
//   escaped unicode literals like `\u2605`
//   escaped unicode 32 literals like `\U00002605`
//   escaped utf8 runes in hex like `\xe2\x98\x85` => (★)
//   escaped utf8 runes in octal like `\342\230\205` => (★)
// On entry `s.pos` is on the opening backtick; on return it is on the closing
// backtick, or at `s.text.len` when no closing backtick was found.
// When `s.is_fmt` is set, the raw text between the backticks is returned without
// decoding or validation. Invalid literals are reported through
// `s.error_with_pos`/`s.error`; a lone `'` is returned as `\'`.
pub fn (mut s Scanner) ident_char() string {
    // position of the opening backtick, used for the error reports below;
    // the column goes through u16_col, like in current_pos()
    lspos := token.Pos{
        line_nr:  s.line_nr
        pos:      s.pos
        col:      u16_col(s.pos - s.last_nl_pos - 1)
        file_idx: s.file_idx
    }

    start := s.pos // the string position of the first backtick char
    mut len := 0

    // set flags for advanced escapes first
    // (start + 1 is the backslash, start + 2 the escape kind, start + 3 its first digit)
    escaped_hex := s.expect('\\x', start + 1) && s.text.len > start + 3
        && s.text[start + 3].is_hex_digit()
    escaped_unicode_16 := s.expect('\\u', start + 1) && s.text.len > start + 3
        && s.text[start + 3].is_hex_digit()
    escaped_unicode_32 := s.expect('\\U', start + 1) && s.text.len > start + 3
        && s.text[start + 3].is_hex_digit()
    escaped_octal := !escaped_hex && !escaped_unicode_16 && !escaped_unicode_32
        && s.expect('\\', start + 1) && s.text.len > start + 2 && s.text[start + 2].is_oct_digit()

    // walk the string to get characters up to the next backtick
    for {
        s.pos++
        if s.pos >= s.text.len {
            break
        }
        // backslashes are not counted here
        if s.text[s.pos] != backslash {
            len++
        }
        double_slash := s.expect('\\\\', s.pos - 2)
        // a backtick ends the literal, unless it is itself escaped by a single backslash
        if s.text[s.pos] == `\`` && (s.text[s.pos - 1] != backslash || double_slash) {
            // ` // apostrophe balance comment. do not remove
            if double_slash {
                len++
            }
            break
        }
    }
    // the closing backtick was counted by the loop too
    len--
    mut c := s.text[start + 1..s.pos]
    if s.is_fmt {
        return c
    }
    if len != 1 {
        // the string inside the backticks is longer than one character
        // but we might only have one rune... attempt to decode escapes
        // if the content expresses an escape code, it will have an even number of characters
        // e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605
        // we don't handle binary escape codes in rune literals
        orig := c
        if c.len & 1 == 0
            && (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) {
            if escaped_unicode_16 {
                // there can only be one, so attempt to decode it now
                c = s.decode_u16erune(c)
            } else if escaped_unicode_32 {
                // there can only be one, so attempt to decode it now
                c = s.decode_u32erune(c)
            } else {
                // find escape sequence start positions
                mut escapes_pos := []int{}
                for i, v in c {
                    if v == `\\` {
                        escapes_pos << i
                    }
                }
                if escaped_hex {
                    c = s.decode_h_escapes(c, 0, escapes_pos)
                } else {
                    c = s.decode_o_escapes(c, 0, escapes_pos)
                }
            }
        }

        u := c.runes()
        if u.len != 1 {
            // build a printable list of the runes that were found; a backslash
            // is kept together with the rune that follows it
            mut err_info := []string{cap: u.len}
            mut i := 0
            for i < u.len {
                if u[i] != `\\` || i == u.len - 1 {
                    err_info << '`${u[i]}`'
                    i++
                    continue
                }
                err_info << '`\\${u[i + 1]}`'
                i += 2
            }
            if escaped_hex || escaped_unicode_16 || escaped_unicode_32 {
                s.error_with_pos('invalid character literal `${orig}` => `${c}` ([${err_info.join(', ')}]) (escape sequence did not refer to a singular rune)',
                    lspos)
            } else if u.len == 0 {
                s.add_error_detail('use quotes for strings, backticks for characters')
                s.error_with_pos('invalid empty character literal `${orig}`', lspos)
            } else {
                s.add_error_detail('use quotes for strings, backticks for characters')
                s.error_with_pos('invalid character literal `${orig}` => `${c}` ([${err_info.join(', ')}]) (more than one character)',
                    lspos)
            }
        }
    } else if c.ends_with('\n') {
        s.add_error_detail('use quotes for strings, backticks for characters')
        s.error_with_pos('invalid character literal, use \`\\n\` instead', lspos)
    } else if c.len > len {
        // one counted character, but more bytes than that: a backslash escape;
        // its last byte must be a known escape character or a digit
        ch := c[c.len - 1]
        if !util.is_escape_sequence(ch) && !digit_table[ch] {
            s.error('`${ch.ascii_str()}` unknown escape sequence')
        }
    }
    // Escapes a `'` character
    if c == "'" {
        return '\\' + c
    }
    return c
}
1657 
// expect reports whether the scanned text contains exactly `want`,
// starting at the byte offset `start_pos`.
// It returns false, instead of reading out of bounds, when the requested
// range does not fit entirely inside `s.text`.
@[direct_array_access; inline]
fn (s &Scanner) expect(want string, start_pos int) bool {
    stop := start_pos + want.len
    fits := start_pos >= 0 && stop >= 0 && start_pos < s.text.len && stop <= s.text.len
    if !fits {
        return false
    }
    for offset in 0 .. want.len {
        if want[offset] != s.text[start_pos + offset] {
            return false
        }
    }
    return true
}
1671 
// ignore_line skips the rest of the current line:
// it first advances `s.pos` with eat_to_end_of_line(),
// then updates the line bookkeeping with inc_line_number().
@[inline]
fn (mut s Scanner) ignore_line() {
    s.eat_to_end_of_line()
    s.inc_line_number()
}
1677 
// eat_to_end_of_line advances `s.pos`, until it is either on a line feed byte,
// or at the end of the text. The line feed itself is not consumed.
@[direct_array_access; inline]
fn (mut s Scanner) eat_to_end_of_line() {
    for {
        if s.pos >= s.text.len || s.text[s.pos] == b_lf {
            break
        }
        s.pos++
    }
}
1684 
// inc_line_number registers the start of a new line:
// it remembers the position of the last newline (capped to the index of the
// last byte of the text), increments the current line number, and raises
// `s.nr_lines` when the new line number exceeds it.
@[direct_array_access; inline]
fn (mut s Scanner) inc_line_number() {
    last_idx := s.text.len - 1
    s.last_nl_pos = if s.pos < last_idx { s.pos } else { last_idx }
    s.line_nr++
    if s.nr_lines < s.line_nr {
        s.nr_lines = s.line_nr
    }
}
1693 
// current_pos returns a token.Pos, describing where the scanner is right now:
// its line number, byte position, column (clamped by u16_col) and file index.
pub fn (mut s Scanner) current_pos() token.Pos {
    column := u16_col(s.current_column() - 1)
    return token.Pos{
        file_idx: s.file_idx
        line_nr:  s.line_nr
        pos:      s.pos
        col:      column
    }
}
1702 
// note reports `msg` as a notice at the current scanner position.
// When `s.pref.notes_are_errors` is set, the notice is reported as an error instead.
// Otherwise it is either printed right away (stdout output mode, when not in
// check-only mode), or appended to `s.notices`.
pub fn (mut s Scanner) note(msg string) {
    // Use the same position helper as warn() and error(), so that the notice
    // carries a column too, and not just a line number and a byte offset.
    pos := s.current_pos()
    if s.pref.notes_are_errors {
        s.error_with_pos(msg, pos)
        return
    }
    if s.pref.output_mode == .stdout && !s.pref.check_only {
        util.show_compiler_message('notice:', pos: pos, file_path: s.file_path, message: msg)
    } else {
        s.notices << errors.Notice{
            file_path: s.file_path
            pos:       pos
            reporter:  .scanner
            message:   msg
        }
    }
}
1724 
// add_error_detail appends `msg` to the list of pending error details.
// call this *before* calling error or warn
// (the pending details are consumed by the next eat_details() call).
pub fn (mut s Scanner) add_error_detail(msg string) {
    s.error_details << msg
}
1729 
// add_error_detail_with_pos adds a pending error detail, that is formatted
// as a 'details:' message for the position `pos` in the current file,
// and is prefixed with a newline.
pub fn (mut s Scanner) add_error_detail_with_pos(msg string, pos token.Pos) {
    formatted := util.formatted_error('details:', msg, s.file_path, pos)
    s.add_error_detail('\n' + formatted)
}
1733 
// eat_details returns all the pending error details, joined by newlines,
// and empties the pending list. It returns '' when nothing is pending.
fn (mut s Scanner) eat_details() string {
    if s.error_details.len == 0 {
        return ''
    }
    joined := s.error_details.join('\n')
    s.error_details = []
    return joined
}
1742 
// warn reports `msg` as a warning at the current scanner position.
pub fn (mut s Scanner) warn(msg string) {
    here := s.current_pos()
    s.warn_with_pos(msg, here)
}
1746 
// warn_with_pos reports `msg` as a warning at the position `pos`.
// When `s.pref.warns_are_errors` is set, it is reported as an error instead.
// Otherwise the pending error details are consumed, and the warning is either
// printed right away (stdout output mode, when not in check-only mode), or
// appended to `s.warnings`. When the message limit is already reached, the
// warning is dropped, and `s.should_abort` is set.
pub fn (mut s Scanner) warn_with_pos(msg string, pos token.Pos) {
    if s.pref.warns_are_errors {
        s.error_with_pos(msg, pos)
        return
    }
    details := s.eat_details()
    prints_directly := s.pref.output_mode == .stdout && !s.pref.check_only
    if prints_directly {
        util.show_compiler_message('warning:',
            pos:       pos
            file_path: s.file_path
            message:   msg
            details:   details
        )
        return
    }
    limit := s.pref.message_limit
    if limit >= 0 && s.warnings.len >= limit {
        s.should_abort = true
        return
    }
    s.warnings << errors.Warning{
        file_path: s.file_path
        pos:       pos
        reporter:  .scanner
        message:   msg
        details:   details
    }
}
1774 
// error reports `msg` as an error at the current scanner position.
pub fn (mut s Scanner) error(msg string) {
    here := s.current_pos()
    s.error_with_pos(msg, here)
}
1778 
// error_with_pos reports `msg` as an error at the position `pos`.
// The pending error details are consumed first. Then:
//   - in stdout output mode (when not in check-only mode), or when
//     `s.pref.fatal_errors` is set, the error is printed, and the program exits with code 1;
//   - otherwise the error is appended to `s.errors`, unless the message limit
//     is already reached, in which case it is dropped, and `s.should_abort` is set.
pub fn (mut s Scanner) error_with_pos(msg string, pos token.Pos) {
    details := s.eat_details()
    prints_directly := s.pref.output_mode == .stdout && !s.pref.check_only
    if prints_directly || s.pref.fatal_errors {
        util.show_compiler_message('error:',
            pos:       pos
            file_path: s.file_path
            message:   msg
            details:   details
        )
        exit(1)
    }
    limit := s.pref.message_limit
    if limit >= 0 && s.errors.len >= limit {
        s.should_abort = true
        return
    }
    s.errors << errors.Error{
        file_path: s.file_path
        pos:       pos
        reporter:  .scanner
        message:   msg
        details:   details
    }
}
1812 
// trace prints a debug line with the address and the value of `x`,
// but only while the scanned file has the base name `fbase`.
fn (mut s Scanner) trace[T](fbase string, x &T) {
    if s.file_base != fbase {
        return
    }
    println('> s.trace | ${fbase:-10s} | ${voidptr(x):16} | ${x}')
}
1818 
// prepare_for_new_text makes the scanner reusable for another input:
// it stores `text` as the new source, and resets the internal scanning state,
// so that a subsequent s.scan_text() call can produce the tokens for that text.
pub fn (mut s Scanner) prepare_for_new_text(text string) {
    // the new input, and the cursor/line bookkeeping
    s.text = text
    s.pos = -1
    s.last_nl_pos = -1
    s.last_lt = -1
    s.line_nr = 0
    s.nr_lines = 0
    s.eofs = 0
    s.tidx = 0
    s.quote = 0
    // the state flags
    s.should_abort = false
    s.is_inside_toplvl_statement = false
    s.is_inside_string = false
    s.is_nested_string = false
    // the collected tokens and positions
    s.all_tokens.clear()
    s.all_pos.clear()
    s.str_helper_tokens.clear()
    s.str_segments.clear()
    s.u16_escapes_pos.clear()
    s.u32_escapes_pos.clear()
    s.h_escapes_pos.clear()
    // the collected messages
    s.errors.clear()
    s.error_details.clear()
    s.warnings.clear()
    s.notices.clear()
}
1848 
// new_silent_scanner returns a new scanner instance, whose preferences use the
// `.silent` output mode. Such a scanner just sets its internal flags, and appends
// the errors to its .errors field, *without aborting the program*. That is mainly
// useful for programs, that want to lex potentially invalid V source code repeatedly,
// and that do their own error handling (by checking .errors.len).
pub fn new_silent_scanner() &Scanner {
    mut silent_prefs := pref.new_preferences()
    silent_prefs.output_mode = .silent
    return &Scanner{
        pref: silent_prefs
    }
}
1859 
// str_quote returns the last entry of `s.str_helper_tokens`, when that entry
// is a single or a double quote character. In all other cases (including
// when there are no entries), it returns 255.
@[direct_array_access]
fn (s Scanner) str_quote() u8 {
    count := s.str_helper_tokens.len
    if count > 0 {
        top := s.str_helper_tokens[count - 1]
        if top == `'` || top == `"` {
            return top
        }
    }
    return 255
}
1871 
// is_likely_unclosed_string_interpolation returns true, when all of these hold:
//   - `current_quote` is the same as `s.quote`
//   - `s.str_helper_tokens` is not empty, and str_quote() returns 255 for it
//   - the last scanned token is a number, a string, a char token, or one of `)`, `]`, `}`
@[direct_array_access; inline]
fn (s &Scanner) is_likely_unclosed_string_interpolation(current_quote u8) bool {
    if s.all_tokens.len == 0 || s.str_helper_tokens.len == 0 {
        return false
    }
    if current_quote != s.quote || s.str_quote() != 255 {
        return false
    }
    last_kind := s.all_tokens[s.all_tokens.len - 1].kind
    return match last_kind {
        .number, .string, .chartoken, .rpar, .rsbr, .rcbr { true }
        else { false }
    }
}
1881 
// u16_col converts the column number `col` to u16, clamping it to the
// representable range: negative values become 0, and values above 65535
// become 65535 (a plain cast would silently wrap them around instead,
// so that for example column 65536 would be reported as column 0).
@[inline]
fn u16_col(col int) u16 {
    if col < 0 {
        return u16(0)
    }
    if col > 65535 {
        return u16(65535)
    }
    return u16(col)
}
1886