v2 / vlib / v / scanner / scanner.v
1885 lines · 1802 sloc · 52.85 KB · 1004bcd9f704bcc24b8c8aa8bb714683233a05eb
Raw
1// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module scanner
5
6import os
7import strconv
8import v.token
9import v.pref
10import v.util
11import v.errors
12
13@[markused]
14const workaround_markused_bug = map[string]int{}
15
// The two quote characters that are handled as string delimiters by the scanner.
16const single_quote = `'`
17const double_quote = `"`
18// char used as number separator
19const num_sep = `_`
// Byte values 10 and 13 (line feed and carriage return).
20const b_lf = 10
21const b_cr = 13
22const backslash = `\\`
// Per-byte lookup tables, filled once by get_digit_table() and get_letter_table().
23const digit_table = get_digit_table()
24const letter_table = get_letter_table()
25
// get_digit_table builds a 256 entry lookup table, indexed by byte value.
// An entry is true, when `u8.is_digit()` is true for that byte.
@[direct_array_access]
fn get_digit_table() [256]bool {
	mut table := [256]bool{}
	for b := 0; b < 256; b++ {
		table[b] = u8(b).is_digit()
	}
	return table
}
34
// get_letter_table builds a 256 entry lookup table, indexed by byte value.
// An entry is true, when `u8.is_letter()` is true for that byte.
@[direct_array_access]
fn get_letter_table() [256]bool {
	mut table := [256]bool{}
	for b := 0; b < 256; b++ {
		table[b] = u8(b).is_letter()
	}
	return table
}
43
// Scanner holds the text that is being scanned, the current position in it,
// the buffer of the already scanned tokens (`all_tokens`), and the diagnostics
// (errors/warnings/notices) that were collected so far.
44@[minify]
45pub struct Scanner {
46pub mut:
47 file_path string // '/path/to/file.v'
48 file_base string // 'file.v'
49 file_idx i16 = -1 // file idx in the global table `filelist`
50 text string // the whole text of the file
51 pos int = -1 // current position in the file, first character is s.text[0]
52 line_nr int // current line number
53 last_nl_pos int = -1 // for calculating column
54 is_inside_string bool // set to true in a string, *at the start* of a ${expr}
55 is_nested_string bool // '${'abc':-12s}'
56 str_helper_tokens []u8 = []u8{cap: 16} // ', ", 0 (string interpolation with lcbr), { (block)
57 line_comment string // the text of the last `//` comment, that was scanned while comments were parsed
58 last_lt int = -1 // position of latest <
59 is_print_line_on_error bool
60 is_print_colored_error bool
61 is_print_rel_paths_on_error bool
62 quote u8 // which quote is used to denote current string: ' or "
63 nr_lines int // total number of lines in the source file that were scanned
64 is_fmt bool // Used for v fmt.
65 comments_mode CommentsMode
66 is_inside_toplvl_statement bool // *only* used in comments_mode: .toplevel_comments, toggled by parser
67 all_tokens []token.Token // *only* used in comments_mode: .toplevel_comments, contains all tokens
68 tidx int // index in all_tokens of the token, that the next scan() call will look at
69 eofs int // how many times end_of_file() was called
70 max_eofs int = 50 // end_of_file() panics, once eofs becomes bigger than this
71 pref &pref.Preferences
72 error_details []string
73 errors []errors.Error
74 warnings []errors.Warning
75 notices []errors.Notice
76 should_abort bool // when too many errors/warnings/notices are accumulated, should_abort becomes true, and the scanner should stop
77
78 // the following are used only inside ident_string, but are here to avoid allocating new arrays for the most common case of strings without escapes
79 all_pos []int = []int{cap: 30}
80 u16_escapes_pos []int = []int{cap: 10} // pos list of \uXXXX
81 u32_escapes_pos []int = []int{cap: 10} // pos list of \UXXXXXXXX
82 h_escapes_pos []int = []int{cap: 10} // pos list of \xXX
83 str_segments []string = []string{cap: 10}
84}
85
86/*
87How the .toplevel_comments mode works:
88
89In this mode, the scanner scans *everything* at once, before parsing starts,
90including all the comments, and stores the results in a buffer s.all_tokens.
91
92Then .scan() just returns s.all_tokens[ s.tidx++ ] *ignoring* the
93comment tokens. In other words, by default in this mode, the parser
94*will not see any comments* inside top level statements, so it has
95no reason to complain about them.
96
97When the parser determines, that it is outside of a top level statement,
98it tells the scanner to backtrack s.tidx to the current p.tok index,
99then it changes .is_inside_toplvl_statement to false, and refills its
100lookahead buffer (i.e. p.peek_tok), from the scanner.
101
102In effect, from the parser's point of view, the next tokens, that it will
103receive with p.next(), will be the same, as if comments are not ignored
104anymore, *between* top level statements.
105
106When the parser determines, that it is going again inside a top level
107statement, it does the same, this time setting .is_inside_toplvl_statement
108to true, again refilling the lookahead buffer => calling .next() in this
109mode, will again ignore all the comment tokens, till the top level statement
110is finished.
111*/
112// The different kinds of scanner modes:
113//
114// .skip_comments - simplest/fastest, just ignores all comments early.
115// This mode is used by the compiler itself.
116//
117// .parse_comments is used by vfmt. Ideally it should handle inline /* */
118// comments too, i.e. it returns every kind of comment as a new token.
119//
120// .toplevel_comments is used by vdoc, parses *only* top level ones
121// that are *outside* structs/enums/fns.
122pub enum CommentsMode {
123 skip_comments // comment tokens are not stored in the token buffer at all
124 parse_comments // scan() returns every comment token
125 toplevel_comments // scan() returns comment tokens, only while is_inside_toplvl_statement is false
126}
127
// new_scanner_file creates a scanner for the content of the file `file_path`,
// and scans all of its tokens into the token buffer right away.
// It returns an error, when `os.is_file(file_path)` is false, or when the file can not be read.
pub fn new_scanner_file(file_path string, file_idx i16, comments_mode CommentsMode, pref_ &pref.Preferences) !&Scanner {
	if !os.is_file(file_path) {
		return error('${file_path} is not a .v file')
	}
	raw_text := util.read_file(file_path)!
	// start from the same defaults, that are used for scanning plain strings,
	// then fill in the file specific fields:
	mut s := new_plain_scanner(raw_text, comments_mode, pref_)
	s.file_path = file_path
	s.file_base = os.base(file_path)
	s.file_idx = file_idx
	s.scan_all_tokens_in_buffer()
	return s
}
150
// Used as both `file_path` and `file_base` of scanners, that are created from a string, and not from a file.
151const internally_generated_v_code = 'internally_generated_v_code'
152
// new_scanner creates a scanner for the in-memory `text`,
// and scans all of its tokens into the token buffer right away.
pub fn new_scanner(text string, comments_mode CommentsMode, pref_ &pref.Preferences) &Scanner {
	mut sc := new_plain_scanner(text, comments_mode, pref_)
	sc.scan_all_tokens_in_buffer()
	return sc
}
159
// new_plain_scanner creates a scanner for `text`, *without* scanning anything yet.
// The file path and file base are both set to `internally_generated_v_code`.
fn new_plain_scanner(text string, comments_mode CommentsMode, pref_ &pref.Preferences) &Scanner {
	return &Scanner{
		file_path: internally_generated_v_code
		file_base: internally_generated_v_code
		text: text
		pref: pref_
		comments_mode: comments_mode
		is_fmt: pref_.is_fmt
		is_print_line_on_error: true
		is_print_colored_error: true
		is_print_rel_paths_on_error: true
		// preallocate room for roughly 1 token per 3 bytes of source
		all_tokens: []token.Token{cap: text.len / 3}
	}
}
174
// free releases the token buffer `s.all_tokens`.
// Nothing else is freed here; see the note about `s.text` below.
175@[unsafe]
176pub fn (mut s Scanner) free() {
177 unsafe {
178 // Note: s.text is not freed here, because it is shared with all other util.read_file instances,
179 // and strings are not reference counted yet:
180 // s.text.free()
181 // .all_tokens however are not shared with anything, and can be freed:
182 s.all_tokens.free()
183 }
184}
185
// should_parse_comment returns true, when comment tokens should be visible in the current state:
// always in .parse_comments mode, and in .toplevel_comments mode only outside of top level statements.
@[inline]
fn (s &Scanner) should_parse_comment() bool {
	return match s.comments_mode {
		.parse_comments { true }
		.toplevel_comments { !s.is_inside_toplvl_statement }
		.skip_comments { false }
	}
}
191
// set_is_inside_toplevel_statement stores `newstate` in s.is_inside_toplvl_statement.
// That flag is read by should_parse_comment(), when the comments mode is .toplevel_comments.
192// Note: this is called by v's parser
193pub fn (mut s Scanner) set_is_inside_toplevel_statement(newstate bool) {
194 s.is_inside_toplvl_statement = newstate
195}
196
// set_current_tidx moves the token cursor `s.tidx` to `cidx`,
// after clamping it to the range 0 ..= s.all_tokens.len .
pub fn (mut s Scanner) set_current_tidx(cidx int) {
	upper := s.all_tokens.len
	s.tidx = if cidx < 0 {
		0
	} else if cidx > upper {
		upper
	} else {
		cidx
	}
}
202
// new_token creates a token of kind `tok_kind`, with the literal `lit`, that ends at
// the current position `s.pos`, and is `len` bytes long. It also increments s.tidx.
// The line number of the token is s.line_nr + 1, except for .hash tokens, which use s.line_nr as is.
// The column is at least 1.
@[inline]
fn (mut s Scanner) new_token(tok_kind token.Kind, lit string, len int) token.Token {
	cidx := s.tidx
	s.tidx++
	line_offset := if tok_kind == .hash { 0 } else { 1 }
	mut max_column := s.current_column() - len + 1
	if max_column < 1 {
		max_column = 1
	}
	return token.Token{
		kind: tok_kind
		lit: lit
		line_nr: s.line_nr + line_offset
		// A plain u16() cast silently wraps around for columns above 65535 (very long lines).
		// u16_col is the helper, that pos_from_bounds() already uses for columns in this module.
		// NOTE(review): u16_col presumably clamps, instead of wrapping - confirm against its definition.
		col: u16_col(max_column)
		pos: s.pos - len + 1
		len: len
		tidx: cidx
		file_idx: s.file_idx
	}
}
223
// new_eof_token creates an .eof token at the current position.
// Unlike new_token(), it does not change s.tidx.
@[inline]
fn (s &Scanner) new_eof_token() token.Token {
	return token.Token{
		kind: .eof
		lit: ''
		line_nr: s.line_nr + 1
		// A plain u16() cast silently wraps around for columns above 65535 (very long lines).
		// u16_col is the helper, that pos_from_bounds() already uses for columns in this module.
		// NOTE(review): u16_col presumably clamps, instead of wrapping - confirm against its definition.
		col: u16_col(s.current_column())
		pos: s.pos
		len: 1
		tidx: s.tidx
		file_idx: s.file_idx
	}
}
237
// new_multiline_token creates a token, that can span several lines. It works like new_token(),
// but the line number of the token is `start_line + 1`, instead of being based on s.line_nr.
// It also increments s.tidx. The column is at least 1.
@[inline]
fn (mut s Scanner) new_multiline_token(tok_kind token.Kind, lit string, len int, start_line int) token.Token {
	cidx := s.tidx
	s.tidx++
	mut max_column := s.current_column() - len + 1
	if max_column < 1 {
		max_column = 1
	}
	return token.Token{
		kind: tok_kind
		lit: lit
		line_nr: start_line + 1
		// A plain u16() cast silently wraps around for columns above 65535 (very long lines).
		// u16_col is the helper, that pos_from_bounds() already uses for columns in this module.
		// NOTE(review): u16_col presumably clamps, instead of wrapping - confirm against its definition.
		col: u16_col(max_column)
		pos: s.pos - len + 1
		len: len
		tidx: cidx
		file_idx: s.file_idx
	}
}
257
// ident_name returns the name, that starts at the current position. The first character
// is always accepted; the name then extends over all following characters, for which
// util.func_char_table is true. s.pos is left on the last character of the name.
@[direct_array_access; inline]
fn (mut s Scanner) ident_name() string {
	first := s.pos
	mut end := first + 1
	for end < s.text.len && util.func_char_table[s.text[end]] {
		end++
	}
	s.pos = end - 1
	return s.text[first..end]
}
274
// num_lit returns the text of the number literal in s.text[start..end].
// When s.is_fmt is true, the text is returned as is (including its `_` separators).
// Otherwise, a new string is allocated, and the literal is copied into it without the `_` separators.
fn (s &Scanner) num_lit(start int, end int) string {
	if s.is_fmt {
		return s.text[start..end]
	}
	unsafe {
		src := s.text.str
		// one extra byte, for the terminating 0
		mut dst := malloc_noscan(end - start + 1)
		mut written := 0
		for idx := start; idx < end; idx++ {
			ch := src[idx]
			if ch == num_sep {
				continue
			}
			dst[written] = ch
			written++
		}
		dst[written] = 0 // C string compatibility
		return dst.vstring_with_len(written)
	}
}
293
// number_prefixed_identifier_name checks, whether s.text[start_pos..end_pos] looks like an
// identifier, that wrongly starts with a number (one or more digits/`_`, then a letter, then
// only characters for which util.func_char_table is true), and is followed by one of
// `:` `=` `,` `)` `]` `}` `.` `;` or by the end of the text (ignoring whitespace).
// It returns that text in this case, and '' otherwise.
@[direct_array_access; inline]
fn (s &Scanner) number_prefixed_identifier_name(start_pos int, end_pos int) string {
	if end_pos <= start_pos {
		return ''
	}
	if !digit_table[s.text[start_pos]] {
		return ''
	}
	// skip over the leading run of digits and separators
	mut name_start := start_pos
	for name_start < end_pos {
		ch := s.text[name_start]
		if !digit_table[ch] && ch != num_sep {
			break
		}
		name_start++
	}
	if name_start >= end_pos {
		return ''
	}
	if !letter_table[s.text[name_start]] {
		return ''
	}
	// the rest should consist only of characters, that are valid in names
	mut idx := name_start
	for idx < end_pos {
		if !util.func_char_table[s.text[idx]] {
			return ''
		}
		idx++
	}
	follower := s.next_non_space_char(end_pos)
	if follower in [`:`, `=`, `,`, `)`, `]`, `}`, `.`, `;`, `\0`] {
		return s.text[start_pos..end_pos]
	}
	return ''
}
317
// next_non_space_char returns the first byte at or after `pos`, for which
// util.non_whitespace_table is true. It returns `\0`, when there is no such byte.
@[direct_array_access; inline]
fn (s &Scanner) next_non_space_char(pos int) u8 {
	mut idx := pos
	for idx < s.text.len {
		ch := s.text[idx]
		if util.non_whitespace_table[ch] {
			return ch
		}
		idx++
	}
	return `\0`
}
327
// pos_from_bounds creates a token.Pos for the text between `start_pos` and `end_pos`,
// on the current line s.line_nr. The column is relative to the last seen newline.
@[inline]
fn (s &Scanner) pos_from_bounds(start_pos int, end_pos int) token.Pos {
	span := end_pos - start_pos
	column := start_pos - s.last_nl_pos - 1
	return token.Pos{
		file_idx: s.file_idx
		line_nr: s.line_nr
		pos: start_pos
		col: u16_col(column)
		len: span
	}
}
338
// ident_bin_number scans a binary number literal. s.pos should be on the `0` of its `0b` prefix.
// Misplaced `_` separators, a missing number part, and unsuitable digits are reported with s.error().
// It returns the text of the literal, as produced by s.num_lit(), and steps s.pos back by one
// before returning.
@[direct_array_access]
fn (mut s Scanner) ident_bin_number() string {
	literal_start := s.pos
	mut bad_digit_seen := false
	mut bad_digit_pos := 0
	mut bad_digit := `\0`
	s.pos += 2 // skip '0b'
	if s.pos < s.text.len && s.text[s.pos] == num_sep {
		s.error('separator `_` is only valid between digits in a numeric literal')
	}
	for s.pos < s.text.len {
		ch := s.text[s.pos]
		if ch == num_sep {
			if s.text[s.pos - 1] == num_sep {
				s.error('cannot use `_` consecutively')
			}
		} else if !ch.is_bin_digit() {
			is_alnum := digit_table[ch] || letter_table[ch]
			if !is_alnum || s.is_inside_string || s.is_nested_string {
				// the literal ends here
				break
			}
			// a letter or a non binary digit: remember only the first one, and keep scanning
			if !bad_digit_seen {
				bad_digit_seen = true
				bad_digit_pos = s.pos
				bad_digit = ch
			}
		}
		s.pos++
	}
	ends_with_sep := s.text[s.pos - 1] == num_sep
	if ends_with_sep {
		s.pos--
		s.error('cannot use `_` at the end of a numeric literal')
	} else if s.pos == literal_start + 2 {
		s.pos-- // adjust error position
		s.error('number part of this binary is not provided')
	} else if bad_digit_seen {
		s.pos = bad_digit_pos // adjust error position
		s.error('this binary number has unsuitable digit `${bad_digit.str()}`')
	}
	literal := s.num_lit(literal_start, s.pos)
	s.pos--
	return literal
}
379
// ident_hex_number scans a hexadecimal number literal. s.pos should be on the `0` of its `0x` prefix.
// Misplaced `_` separators, a missing number part, and unsuitable digits are reported with s.error().
// It returns the text of the literal, as produced by s.num_lit(), and steps s.pos back by one
// before returning.
// NOTE(review): when there is nothing after the prefix in the text, '0x' is returned right away,
// without moving s.pos, and without an error, unlike the binary/octal cases - confirm that this is intended.
@[direct_array_access]
fn (mut s Scanner) ident_hex_number() string {
	literal_start := s.pos
	mut bad_digit_seen := false
	mut bad_digit_pos := 0
	mut bad_digit := `\0`
	if s.pos + 2 >= s.text.len {
		return '0x'
	}
	s.pos += 2 // skip '0x'
	if s.pos < s.text.len && s.text[s.pos] == num_sep {
		s.error('separator `_` is only valid between digits in a numeric literal')
	}
	for s.pos < s.text.len {
		ch := s.text[s.pos]
		if ch == num_sep {
			if s.text[s.pos - 1] == num_sep {
				s.error('cannot use `_` consecutively')
			}
		} else if !ch.is_hex_digit() {
			if !letter_table[ch] || s.is_inside_string || s.is_nested_string {
				// the literal ends here
				break
			}
			// a non hexadecimal letter: remember only the first one, and keep scanning
			if !bad_digit_seen {
				bad_digit_seen = true
				bad_digit_pos = s.pos
				bad_digit = ch
			}
		}
		s.pos++
	}
	ends_with_sep := s.text[s.pos - 1] == num_sep
	if ends_with_sep {
		s.pos--
		s.error('cannot use `_` at the end of a numeric literal')
	} else if s.pos == literal_start + 2 {
		s.pos-- // adjust error position
		s.error('number part of this hexadecimal is not provided')
	} else if bad_digit_seen {
		s.pos = bad_digit_pos // adjust error position
		s.error('this hexadecimal number has unsuitable digit `${bad_digit.str()}`')
	}
	literal := s.num_lit(literal_start, s.pos)
	s.pos--
	return literal
}
423
// ident_oct_number scans an octal number literal. s.pos should be on the `0` of its `0o` prefix.
// Misplaced `_` separators, a missing number part, and unsuitable digits are reported with s.error().
// It returns the text of the literal, as produced by s.num_lit(), and steps s.pos back by one
// before returning.
@[direct_array_access]
fn (mut s Scanner) ident_oct_number() string {
	literal_start := s.pos
	mut bad_digit_seen := false
	mut bad_digit_pos := 0
	mut bad_digit := `\0`
	s.pos += 2 // skip '0o'
	if s.pos < s.text.len && s.text[s.pos] == num_sep {
		s.error('separator `_` is only valid between digits in a numeric literal')
	}
	for s.pos < s.text.len {
		ch := s.text[s.pos]
		if ch == num_sep {
			if s.text[s.pos - 1] == num_sep {
				s.error('cannot use `_` consecutively')
			}
		} else if !ch.is_oct_digit() {
			is_alnum := digit_table[ch] || letter_table[ch]
			if !is_alnum || s.is_inside_string || s.is_nested_string {
				// the literal ends here
				break
			}
			// a letter or a non octal digit: remember only the first one, and keep scanning
			if !bad_digit_seen {
				bad_digit_seen = true
				bad_digit_pos = s.pos
				bad_digit = ch
			}
		}
		s.pos++
	}
	ends_with_sep := s.text[s.pos - 1] == num_sep
	if ends_with_sep {
		s.pos--
		s.error('cannot use `_` at the end of a numeric literal')
	} else if s.pos == literal_start + 2 {
		s.pos-- // adjust error position
		s.error('number part of this octal is not provided')
	} else if bad_digit_seen {
		s.pos = bad_digit_pos // adjust error position
		s.error('this octal number has unsuitable digit `${bad_digit.str()}`')
	}
	literal := s.num_lit(literal_start, s.pos)
	s.pos--
	return literal
}
464
// ident_dec_number scans a decimal number literal, that starts at s.pos. The literal can have
// an integer part, a fractional part after a `.`, and an exponent after `e`/`E`.
// Letters inside the number are remembered as a "wrong digit" (only the first one), and are
// reported at the end, unless the text looks like an identifier that starts with a number,
// which gets its own error message.
// It returns the text of the literal, as produced by s.num_lit(), and steps s.pos back by one
// before returning.
465@[direct_array_access]
466fn (mut s Scanner) ident_dec_number() string {
467 mut has_wrong_digit := false
468 mut first_wrong_digit_pos := 0
469 mut first_wrong_digit := `\0`
470 start_pos := s.pos
471 // scan integer part
472 for s.pos < s.text.len {
473 c := s.text[s.pos]
474 if c == num_sep && s.text[s.pos - 1] == num_sep {
475 s.error('cannot use `_` consecutively')
476 }
477 if !digit_table[c] && c != num_sep {
 // `e`/`E` are not wrong digits here; they are handled by the exponent scanning below
478 if !letter_table[c] || c in [`e`, `E`] || s.is_inside_string || s.is_nested_string {
479 break
480 } else if !has_wrong_digit {
481 has_wrong_digit = true
482 first_wrong_digit_pos = s.pos
483 first_wrong_digit = c
484 }
485 }
486 s.pos++
487 }
488 if s.text[s.pos - 1] == num_sep {
489 s.pos--
490 s.error('cannot use `_` at the end of a numeric literal')
491 }
492 if has_wrong_digit {
 // prefer the more specific error, for things like `1abc := 5`, and return early
493 invalid_ident := s.number_prefixed_identifier_name(start_pos, s.pos)
494 if invalid_ident != '' {
495 s.error_with_pos('identifier name `${invalid_ident}` cannot start with a number', s.pos_from_bounds(start_pos,
496 s.pos))
497 number := s.num_lit(start_pos, s.pos)
498 s.pos--
499 return number
500 }
501 }
502 mut call_method := false // true for, e.g., 5.str(), 5.5.str(), 5e5.str()
503 mut is_range := false // true for, e.g., 5..10
504 // scan fractional part
505 if s.pos < s.text.len && s.text[s.pos] == `.` {
506 s.pos++
507 if s.pos < s.text.len {
508 // 5.5, 5.5.str()
509 if digit_table[s.text[s.pos]] {
510 for s.pos < s.text.len {
511 c := s.text[s.pos]
512 if !digit_table[c] {
513 if !letter_table[c] || c in [`e`, `E`] || s.is_inside_string
514 || s.is_nested_string {
515 // 5.5.str()
516 if c == `.` && s.pos + 1 < s.text.len && letter_table[s.text[s.pos + 1]] {
517 call_method = true
518 }
519 break
520 } else if !has_wrong_digit {
521 has_wrong_digit = true
522 first_wrong_digit_pos = s.pos
523 first_wrong_digit = c
524 }
525 }
526 s.pos++
527 }
528 } else if s.text[s.pos] == `.` {
529 // 5.. (a range)
530 is_range = true
 // step back to the first `.`, so that it is not part of the number
531 s.pos--
532 } else if s.text[s.pos] in [`e`, `E`] {
533 // 5.e5
534 } else if letter_table[s.text[s.pos]] {
535 // 5.str()
536 call_method = true
 // step back to the `.`, so that it is not part of the number
537 s.pos--
538 } else {
539 // 5.
 // count the digits before the `.`, to show the whole number in the warning
540 mut symbol_length := 0
541 for i := s.pos - 2; i > 0 && digit_table[s.text[i - 1]]; i-- {
542 symbol_length++
543 }
544 float_symbol := s.text[s.pos - 2 - symbol_length..s.pos - 1]
545 s.warn('float literals should have a digit after the decimal point, e.g. `${float_symbol}.0`')
546 }
547 }
548 }
549 // scan exponential part
550 mut has_exp := false
551 if s.pos < s.text.len && s.text[s.pos] in [`e`, `E`] && !s.is_inside_string {
552 has_exp = true
553 s.pos++
 // an optional sign can follow the `e`/`E`
554 if s.pos < s.text.len && s.text[s.pos] in [`-`, `+`] {
555 s.pos++
556 }
557 for s.pos < s.text.len {
558 c := s.text[s.pos]
559 if !digit_table[c] {
560 if !letter_table[c] || s.is_inside_string || s.is_nested_string {
561 // 5e5.str()
562 if c == `.` && s.pos + 1 < s.text.len && letter_table[s.text[s.pos + 1]] {
563 call_method = true
564 }
565 break
566 } else if !has_wrong_digit {
567 has_wrong_digit = true
568 first_wrong_digit_pos = s.pos
569 first_wrong_digit = c
570 }
571 }
572 s.pos++
573 }
574 }
575 if has_wrong_digit {
576 // error check: wrong digit
577 s.pos = first_wrong_digit_pos // adjust error position
 // no error is reported, when s.pref.translated is set
578 if !s.pref.translated {
579 s.error('this number has unsuitable digit `${first_wrong_digit.str()}`')
580 }
581 } else if s.text[s.pos - 1] in [`e`, `E`] && !s.is_inside_string {
582 // error check: 5e
583 s.pos-- // adjust error position
584 s.error('exponent has no digits')
585 } else if s.pos < s.text.len && s.text[s.pos] == `.` && !is_range && !call_method {
586 // error check: 1.23.4, 123.e+3.4
587 if has_exp {
588 s.error('exponential part should be integer')
589 } else {
590 s.error('too many decimal points in number')
591 }
592 }
593 number := s.num_lit(start_pos, s.pos)
594 s.pos--
595 return number
596}
597
// ident_number scans the number literal at the current position. It dispatches to the
// binary/hexadecimal/octal scanner, based on the `0b`/`0x`/`0o` prefix at s.pos,
// and to the decimal scanner otherwise.
fn (mut s Scanner) ident_number() string {
	if s.expect('0b', s.pos) {
		return s.ident_bin_number()
	}
	if s.expect('0x', s.pos) {
		return s.ident_hex_number()
	}
	if s.expect('0o', s.pos) {
		return s.ident_oct_number()
	}
	return s.ident_dec_number()
}
609
// skip_whitespace advances s.pos, until it is on a byte for which util.non_whitespace_table
// is true, or until the end of the text. s.inc_line_number() is called for each skipped line feed.
@[direct_array_access; inline]
fn (mut s Scanner) skip_whitespace() {
	for s.pos < s.text.len {
		ch := s.text[s.pos]
		// tabs and spaces are most common, so they are checked first
		is_blank := ch == 9 || ch == 32
		if !is_blank {
			if ch == b_lf {
				s.inc_line_number()
			} else if util.non_whitespace_table[ch] {
				return
			}
		}
		s.pos++
	}
}
630
// end_of_file returns an .eof token, and moves s.pos to the end of the text.
// It counts how many times it was called in s.eofs, and panics when that count
// exceeds s.max_eofs, since then the caller is most likely stuck in a loop.
fn (mut s Scanner) end_of_file() token.Token {
	s.eofs++
	if s.eofs > s.max_eofs {
		s.line_nr--
		if s.file_path == internally_generated_v_code {
			// show a bit more context for that case, since the source may not be easily visible by just inspecting a source file on the filesystem
			eprintln('> internally_generated_v_code, start: ${s.text#[0..50]}')
			eprintln('> internally_generated_v_code, end: ${s.text#[-50..]}')
			eprintln('> internally_generated_v_code, len: ${s.text.len}')
		}
		stuck_msg :=
			'the end of file `${s.file_path}` has been reached ${s.max_eofs} times already, the v parser is probably stuck.\n' +
			'This should not happen. Please report the bug here, and include the last 2-3 lines of your source code:\n' +
			'https://github.com/vlang/v/issues/new?labels=Bug&template=bug_report.md'
		panic(stuck_msg)
	}
	is_first_eof := s.eofs == 1
	if is_first_eof && s.pos != s.text.len {
		s.inc_line_number()
	}
	s.pos = s.text.len
	return s.new_eof_token()
}
652
// scan_all_tokens_in_buffer scans the whole text with s.scan_remaining_text(), which fills
// s.all_tokens, and then resets s.tidx to 0, so that scan() starts from the first token.
// The 'PARSE' timer is paused, and the 'SCAN' timing is measured, for the duration of the call.
// NOTE(review): presumably, so that the scanning time is not counted as parsing time - confirm.
653fn (mut s Scanner) scan_all_tokens_in_buffer() {
654 mut timers := util.get_timers()
655 timers.measure_pause('PARSE')
656 util.timing_start('SCAN')
657 defer {
658 util.timing_measure_cumulative('SCAN')
659 timers.measure_resume('PARSE')
660 }
661 s.scan_remaining_text()
662 s.tidx = 0
 // with `-d trace_scanner`, print every token that was scanned
663 $if trace_scanner ? {
664 for t in s.all_tokens {
665 eprintln('> tidx:${t.tidx:-5} | kind: ${t.kind:-10} | lit.len: ${t.lit.len:-5} | lit: `${t.lit}`')
666 }
667 }
668}
669
// scan_remaining_text calls s.text_scan() repeatedly, and appends the resulting tokens
// to s.all_tokens, until an .eof token is appended, or until s.should_abort is set.
// In .skip_comments mode, comment tokens are not stored.
fn (mut s Scanner) scan_remaining_text() {
	drop_comments := s.comments_mode == .skip_comments
	for {
		tok := s.text_scan()
		if drop_comments && tok.kind == .comment {
			continue
		}
		s.all_tokens << tok
		if tok.kind == .eof || s.should_abort {
			break
		}
	}
}
682
// scan returns the next token from the token buffer, and advances s.tidx past it.
// Comment tokens are skipped, unless s.should_parse_comment() is true.
// When the buffer is exhausted, or when s.should_abort is set, the result of s.end_of_file() is returned.
@[direct_array_access]
pub fn (mut s Scanner) scan() token.Token {
	for {
		cidx := s.tidx
		s.tidx++
		if cidx >= s.all_tokens.len || s.should_abort {
			return s.end_of_file()
		}
		tok := s.all_tokens[cidx]
		if tok.kind != .comment || s.should_parse_comment() {
			return tok
		}
	}
	return s.new_eof_token()
}
698
// peek_token returns the token, that is `n` positions after s.tidx in the token buffer,
// without changing the scanner state. It returns an .eof token, when that position is
// outside of the buffer.
@[direct_array_access; inline]
pub fn (s &Scanner) peek_token(n int) token.Token {
	idx := s.tidx + n
	if idx >= 0 && idx < s.all_tokens.len {
		return s.all_tokens[idx]
	}
	return s.new_eof_token()
}
708
// look_ahead returns the byte, that is `n` positions after s.pos in the text,
// or `\0`, when that position is at or after the end of the text.
@[direct_array_access; inline]
fn (s &Scanner) look_ahead(n int) u8 {
	target := s.pos + n
	if target >= s.text.len {
		return `\0`
	}
	return s.text[target]
}
717
718// text_scan returns a single token from the text, and updates the scanner state,
719// so that it will be ready to get the next token right after that.
720// See also Scanner.prepare_for_new_text and new_silent_scanner()
721@[direct_array_access]
722pub fn (mut s Scanner) text_scan() token.Token {
723 // The for loop here is so that instead of doing
724 // `return s.scan()` (which will use a new call stack frame),
725 // text_scan can just do continue, keeping
726 // memory & stack usage low.
727 // That optimization mostly matters for long sections
728 // of comments and string literals.
729 for {
730 s.pos++
731 if !s.is_inside_string {
732 s.skip_whitespace()
733 }
734 if s.pos >= s.text.len || s.should_abort {
735 return s.end_of_file()
736 }
737 s.skip_whitespace()
738 // end of file
739 if s.pos >= s.text.len {
740 return s.end_of_file()
741 }
742 // handle each char
743 c := s.text[s.pos]
744 nextc := s.look_ahead(1)
745 // name or keyword
746 if util.name_char_table[c] {
747 name := s.ident_name()
748 kind := token.scanner_matcher.find(name)
749 if kind != -1 {
750 return s.new_token(unsafe { token.Kind(kind) }, name, name.len)
751 }
752 return s.new_token(.name, name, name.len)
753 } else if digit_table[c] || (c == `.` && digit_table[nextc]) {
754 // `123`, `.123`
755 if !s.is_inside_string {
756 // In C ints with `0` prefix are octal (in V they're decimal), so discarding heading zeros is needed.
757 mut start_pos := s.pos
758 for start_pos < s.text.len && s.text[start_pos] == `0` {
759 start_pos++
760 }
761 mut prefix_zero_num := start_pos - s.pos // how many prefix zeros should be jumped
762 // for 0b, 0o, 0x the heading zero shouldn't be jumped
763 if start_pos == s.text.len || (c == `0` && !digit_table[s.text[start_pos]]) {
764 prefix_zero_num--
765 }
766 s.pos += prefix_zero_num // jump these zeros
767 }
768 num := s.ident_number()
769 return s.new_token(.number, num, num.len)
770 }
771 // all other tokens
772 match c {
773 `+` {
774 if nextc == `+` {
775 s.pos++
776 return s.new_token(.inc, '', 2)
777 } else if nextc == `=` {
778 s.pos++
779 return s.new_token(.plus_assign, '', 2)
780 }
781 return s.new_token(.plus, '', 1)
782 }
783 `-` {
784 if nextc == `-` {
785 s.pos++
786 return s.new_token(.dec, '', 2)
787 } else if nextc == `=` {
788 s.pos++
789 return s.new_token(.minus_assign, '', 2)
790 }
791 return s.new_token(.minus, '', 1)
792 }
793 `*` {
794 if nextc == `*` {
795 if s.look_ahead(2) == `=` {
796 s.pos += 2
797 return s.new_token(.power_assign, '', 3)
798 }
799 s.pos++
800 return s.new_token(.power, '', 2)
801 }
802 if nextc == `=` {
803 s.pos++
804 return s.new_token(.mult_assign, '', 2)
805 }
806 return s.new_token(.mul, '', 1)
807 }
808 `^` {
809 if nextc == `=` {
810 s.pos++
811 return s.new_token(.xor_assign, '', 2)
812 }
813 return s.new_token(.xor, '', 1)
814 }
815 `%` {
816 if nextc == `=` {
817 s.pos++
818 return s.new_token(.mod_assign, '', 2)
819 }
820 return s.new_token(.mod, '', 1)
821 }
822 `?` {
823 return s.new_token(.question, '?', 1)
824 }
825 single_quote, double_quote {
826 if s.is_likely_unclosed_string_interpolation(c) {
827 s.error_with_pos('expected `}` to close string interpolation', token.Pos{
828 len: 1
829 line_nr: s.line_nr
830 pos: s.pos
831 col: u16_col(s.current_column() - 1)
832 file_idx: s.file_idx
833 last_line: s.line_nr
834 })
835 }
836 s.str_helper_tokens << c
837 start_line := s.line_nr
838 ident_string := s.ident_string()
839 return s.new_multiline_token(.string, ident_string, ident_string.len + 2,
840 start_line) // + two quotes
841 }
842 `\`` {
843 // ` // apostrophe balance comment. do not remove
844 ident_char := s.ident_char()
845 return s.new_token(.chartoken, ident_char, ident_char.len + 2) // + two quotes
846 }
847 `(` {
848 return s.new_token(.lpar, '', 1)
849 }
850 `)` {
851 return s.new_token(.rpar, '', 1)
852 }
853 `[` {
854 return s.new_token(.lsbr, '', 1)
855 }
856 `]` {
857 return s.new_token(.rsbr, '', 1)
858 }
859 `{` {
860 // Keep interpolation helper state only while scanning string interpolation.
861 if s.str_helper_tokens.len > 0 {
862 // Skip { in `${` in strings
863 if 255 != s.str_quote() {
864 s.str_helper_tokens << 0
865 } else {
866 s.str_helper_tokens << c
867 }
868 }
869 if s.is_inside_string && s.text[s.pos - 1] == `$` {
870 continue
871 }
872 return s.new_token(.lcbr, '', 1)
873 }
874 `$` {
875 if s.is_inside_string {
876 return s.new_token(.str_dollar, '', 1)
877 } else {
878 return s.new_token(.dollar, '', 1)
879 }
880 }
881 `}` {
882 // s = `hello ${name} !`
883 if s.str_helper_tokens.len > 0 {
884 s.str_helper_tokens.delete_last()
885 quote := s.str_quote()
886 if 255 != quote {
887 if s.pos < s.text.len - 1 {
888 s.pos++
889 } else {
890 s.error('unfinished string literal')
891 }
892 if s.text[s.pos] == quote {
893 s.is_inside_string = false
894 s.str_helper_tokens.delete_last()
895 return s.new_token(.string, '', 1)
896 }
897 ident_string := s.ident_string()
898 return s.new_token(.string, ident_string, ident_string.len + 2) // + two quotes
899 }
900 }
901 return s.new_token(.rcbr, '', 1)
902 }
903 `&` {
904 if nextc == `&` {
905 if s.look_ahead(2) == `=` {
906 s.pos += 2
907 return s.new_token(.boolean_and_assign, '', 3)
908 }
909 }
910 if nextc == `=` {
911 s.pos++
912 return s.new_token(.and_assign, '', 2)
913 }
914 afternextc := s.look_ahead(2)
915 if nextc == `&` && (afternextc.is_space() || afternextc == `!`) {
916 s.pos++
917 return s.new_token(.and, '', 2)
918 }
919 return s.new_token(.amp, '', 1)
920 }
921 `|` {
922 if nextc == `|` {
923 if s.look_ahead(2) == `=` {
924 s.pos += 2
925 return s.new_token(.boolean_or_assign, '', 3)
926 }
927 s.pos++
928 return s.new_token(.logical_or, '', 2)
929 }
930 if nextc == `=` {
931 s.pos++
932 return s.new_token(.or_assign, '', 2)
933 }
934 return s.new_token(.pipe, '', 1)
935 }
936 `,` {
937 return s.new_token(.comma, '', 1)
938 }
939 `@` {
940 // @[attr]
941 if s.text[s.pos + 1] == `[` {
942 return s.new_token(.at, '', 1)
943 }
944 mut name := ''
945 if nextc != `\0` {
946 s.pos++
947 name = s.ident_name()
948 }
949 if s.is_fmt {
950 return s.new_token(.name, '@' + name, name.len + 1)
951 }
952 // @FN, @STRUCT, @MOD etc. See full list in token.valid_at_tokens
953 if '@' + name in token.valid_at_tokens || name.starts_with('cc') { // `=@cccond` in inline assembly
954 return s.new_token(.at, '@' + name, name.len + 1)
955 }
956 if !token.is_key(name) {
957 // If name is all uppercase, the user is probably looking for a compile time variable ("at-token")
958 if name.is_upper() {
959 comptime_vars := token.valid_at_tokens.join(', ')
960 s.add_error_detail('available compile time variables: ${comptime_vars}'.wrap(
961 width: 90
962 ))
963 }
964 s.error('@ must be used before keywords or compile time variables (e.g. `@type string` or `@FN`)')
965 } else {
966 // s.note('@keyword is being deprecated and then removed from V. Use `keyword_` or a different name (e.g. `typ` instead of `type`)')
967 }
968 return s.new_token(.name, name, name.len)
969 }
970 `.` {
971 if nextc == `.` {
972 s.pos++
973 if s.pos + 1 < s.text.len && s.text[s.pos + 1] == `.` {
974 s.pos++
975 return s.new_token(.ellipsis, '', 3)
976 }
977 return s.new_token(.dotdot, '', 2)
978 }
979 return s.new_token(.dot, '', 1)
980 }
981 `#` {
982 // manage gated arrays/strings
983 if nextc == `[` {
984 s.pos++
985 return s.new_token(.nilsbr, '', 2)
986 }
987
988 start := s.pos + 1
989 s.ignore_line()
990 if nextc == `!` {
991 // treat shebang line (#!) as a comment
992 comment := s.text[start - 1..s.pos].trim_space()
993 if s.line_nr != 1 {
994 comment_pos := token.Pos{
995 line_nr: s.line_nr - 1
996 len: comment.len
997 pos: start
998 col: u16_col(s.current_column() - comment.len)
999 file_idx: s.file_idx
1000 }
1001 s.error_with_pos('a shebang is only valid at the top of the file',
1002 comment_pos)
1003 }
1004 // s.fgenln('// shebang line "$s.line_comment"')
1005 return s.new_token(.comment, comment, comment.len + 2)
1006 }
1007 hash := s.text[start..s.pos].trim_space()
1008 return s.new_token(.hash, hash, hash.len + 2)
1009 }
1010 `>` {
1011 if nextc == `=` {
1012 s.pos++
1013 return s.new_token(.ge, '', 2)
1014 } else if nextc == `>` {
1015 if s.pos + 2 < s.text.len {
1016 if s.text[s.pos + 2] == `=` {
1017 s.pos += 2
1018 return s.new_token(.right_shift_assign, '', 3)
1019 } else if s.text[s.pos + 2] == `>` {
1020 if s.pos + 3 < s.text.len && s.text[s.pos + 3] == `=` {
1021 s.pos += 3
1022 return s.new_token(.unsigned_right_shift_assign, '', 4)
1023 }
1024 s.pos += 2
1025 return s.new_token(.unsigned_right_shift, '', 3)
1026 }
1027 }
1028 s.pos++
1029 return s.new_token(.right_shift, '', 2)
1030 }
1031 return s.new_token(.gt, '', 1)
1032 }
1033 `<` {
1034 if nextc == `=` {
1035 s.pos++
1036 return s.new_token(.le, '', 2)
1037 } else if nextc == `<` {
1038 if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` {
1039 s.pos += 2
1040 return s.new_token(.left_shift_assign, '', 3)
1041 }
1042 s.pos++
1043 return s.new_token(.left_shift, '', 2)
1044 } else if nextc == `-` {
1045 s.pos++
1046 return s.new_token(.arrow, '', 2)
1047 } else {
1048 s.last_lt = s.pos
1049 return s.new_token(.lt, '', 1)
1050 }
1051 }
1052 `=` {
1053 if nextc == `=` {
1054 s.pos++
1055 return s.new_token(.eq, '', 2)
1056 } else {
1057 return s.new_token(.assign, '', 1)
1058 }
1059 }
1060 `:` {
1061 if nextc == `=` {
1062 s.pos++
1063 return s.new_token(.decl_assign, '', 2)
1064 } else {
1065 return s.new_token(.colon, '', 1)
1066 }
1067 }
1068 `;` {
1069 return s.new_token(.semicolon, '', 1)
1070 }
1071 `!` {
1072 if nextc == `=` {
1073 s.pos++
1074 return s.new_token(.ne, '', 2)
1075 } else if s.text.len > s.pos + 3 && nextc == `i` && s.text[s.pos + 2] == `n`
1076 && s.text[s.pos + 3].is_space() {
1077 s.pos += 2
1078 return s.new_token(.not_in, '', 3)
1079 } else if s.text.len > s.pos + 3 && nextc == `i` && s.text[s.pos + 2] == `s`
1080 && s.text[s.pos + 3].is_space() {
1081 s.pos += 2
1082 return s.new_token(.not_is, '', 3)
1083 } else {
1084 return s.new_token(.not, '!', 1)
1085 }
1086 }
1087 `~` {
1088 return s.new_token(.bit_not, '', 1)
1089 }
1090 `/` {
1091 if nextc == `=` {
1092 s.pos++
1093 return s.new_token(.div_assign, '', 2)
1094 }
1095 if nextc == `/` { // Single line comments
1096 start := s.pos + 1
1097 s.ignore_line()
1098 mut comment_line_end := s.pos
1099 if s.text[s.pos - 1] == b_cr {
1100 comment_line_end--
1101 s.pos--
1102 }
1103 // fix line_nr, \n was read; the comment is marked on the next line
1104 s.pos--
1105 s.line_nr--
1106 if s.should_parse_comment() {
1107 s.line_comment = s.text[start + 1..comment_line_end]
1108 mut comment := s.line_comment
1109 // Find out if this comment is on its own line (for vfmt)
1110 mut is_separate_line_comment := true
1111 for j := start - 2; j >= 0 && s.text[j] != b_lf; j-- {
1112 if s.text[j] !in [`\t`, ` `] {
1113 is_separate_line_comment = false
1114 }
1115 }
1116 if is_separate_line_comment {
1117 // Note: ´\x01´ is used to preserve the initial whitespace in comments
1118 // that are on a separate line
1119 comment = '\x01' + comment
1120 }
1121 return s.new_token(.comment, comment, s.line_comment.len + 2)
1122 }
1123 // Skip the comment (return the next token)
1124 continue
1125 } else if nextc == `*` { // Multiline comments
1126 start := s.pos + 2
1127 start_line := s.line_nr
1128 mut nest_count := 1
1129 s.pos++
1130 // Skip comment
1131 for nest_count > 0 && s.pos < s.text.len - 1 {
1132 s.pos++
1133 if s.pos >= s.text.len - 1 {
1134 s.line_nr = start_line
1135 s.error('unterminated multiline comment')
1136 }
1137 if s.text[s.pos] == b_lf {
1138 s.inc_line_number()
1139 continue
1140 }
1141 if s.expect('/*', s.pos) && s.text[s.pos + 2] != `/` {
1142 nest_count++
1143 continue
1144 }
1145 if s.expect('*/', s.pos) {
1146 nest_count--
1147 }
1148 }
1149 s.pos++
1150 if s.should_parse_comment() {
1151 mut comment := s.text[start..(s.pos - 1)]
1152 if !comment.contains('\n') {
1153 comment_pos := token.Pos{
1154 line_nr: start_line
1155 len: comment.len + 4
1156 pos: start
1157 col: u16_col(s.current_column() - comment.len - 4)
1158 file_idx: s.file_idx
1159 }
1160 if !s.pref.is_fmt {
1161 s.error_with_pos('inline comment is deprecated, please use line comment',
1162 comment_pos)
1163 }
1164 comment = '\x01' + comment.trim(' ')
1165 }
1166 return s.new_multiline_token(.comment, comment, comment.len + 4, start_line)
1167 }
1168 // Skip if not in fmt mode
1169 continue
1170 }
1171 return s.new_token(.div, '', 1)
1172 }
1173 else {}
1174 }
1175
1176 $if windows {
1177 if c == `\0` {
1178 return s.end_of_file()
1179 }
1180 }
1181 s.invalid_character()
1182 break
1183 }
1184 return s.end_of_file()
1185}
1186
// invalid_character reports the character found at the current position as invalid.
// The whole UTF-8 sequence that starts at `s.pos` is included in the message,
// clamped to the end of the text when the sequence is cut short.
fn (mut s Scanner) invalid_character() {
	seq_len := utf8_char_len(s.text[s.pos])
	stop := int_min(s.pos + seq_len, s.text.len)
	offending := s.text[s.pos..stop]
	s.error('invalid character `${offending}`')
}
1193
// current_column returns the distance between the current position
// and the position of the last newline that was seen.
@[inline]
fn (s &Scanner) current_column() int {
	offset_from_newline := s.pos - s.last_nl_pos
	return offset_from_newline
}
1198
// count_symbol_before returns how many consecutive `sym` bytes end at index `p`,
// scanning backwards from `p` towards the start of the text.
// It returns 0 when `p` is negative or when s.text[p] is not `sym`.
@[direct_array_access]
fn (s &Scanner) count_symbol_before(p int, sym u8) int {
	mut idx := p
	// walk left while we are still on the run of `sym` bytes
	for idx >= 0 && s.text[idx] == sym {
		idx--
	}
	// idx now rests just before the run, so the difference is its length
	return p - idx
}
1210
// ident_string returns a lexed V string, starting from the current position in the text
// it supports r'strings', c'strings', interpolated 'strings' and "strings", and hex
// escapes in them (except in the r'strings' where the content is returned verbatim)
//
// It returns '' right away, when `s.str_quote()` reports that no string quote is active (255).
// Side effects: it updates `s.quote`, `s.is_nested_string`, `s.is_inside_string`, `s.pos`,
// the line counters, the `*_escapes_pos` buffers, `s.str_segments` and `s.all_pos`.
// When the lexed part ends on the closing quote, the last entry of `s.str_helper_tokens` is removed.
// When a `${` is met, lexing stops with `s.is_inside_string` set to true.
@[direct_array_access]
pub fn (mut s Scanner) ident_string() string {
	quote := s.str_quote()
	if 255 == quote {
		return ''
	}
	s.quote = quote
	// a string that starts while we are still inside the `${...}` of another one, is a nested string
	s.is_nested_string = s.is_inside_string
	lspos := token.Pos{
		line_nr:  s.line_nr
		pos:      s.pos
		col:      u16(s.pos - s.last_nl_pos - 1)
		file_idx: s.file_idx
	}
	q := s.text[s.pos]
	is_quote := q in [single_quote, double_quote]
	is_raw := is_quote && s.pos > 0 && s.text[s.pos - 1] == `r` && !s.is_inside_string
	is_cstr := is_quote && s.pos > 0 && s.text[s.pos - 1] == `c` && !s.is_inside_string
	mut n_cr_chars := 0
	mut start := s.pos
	start_char := s.text[start]
	if start_char == s.quote {
		start++
	} else if start_char == b_lf {
		s.inc_line_number()
	}
	s.is_inside_string = false
	s.u16_escapes_pos.clear()
	s.u32_escapes_pos.clear()
	s.h_escapes_pos.clear()
	mut backslash_count := if start_char == backslash { 1 } else { 0 }
	for {
		s.pos++
		if s.pos >= s.text.len {
			if lspos.line_nr + 1 < s.line_nr {
				s.add_error_detail_with_pos('literal started here', lspos)
			}
			s.error('unfinished string literal')
			break
		}
		c := s.text[s.pos]
		prevc := s.text[s.pos - 1]
		if c == backslash {
			backslash_count++
		}
		// end of string
		if c == s.quote && (is_raw || backslash_count & 1 == 0) {
			// handle '123\\' backslash at the end
			break
		}
		if c == b_cr {
			n_cr_chars++
		}
		if c == b_lf {
			s.inc_line_number()
		}
		// Escape `\x` `\u` `\U`
		if backslash_count & 1 == 1 && !is_raw && !is_cstr {
			// The digit checks below are bounds checked, since this function uses direct array access,
			// and an escape can be the last thing in an unterminated literal at the end of the text.
			// A quote is never a hex digit, so no separate check for the closing quote is needed.
			// Escape `\x`
			if c == `x` {
				if !s.has_hex_digits_at(s.pos + 1, 2) {
					s.error(r'`\x` used without two following hex digits')
				}
				s.h_escapes_pos << s.pos - 1
			}
			// Escape `\u`
			if c == `u` {
				if !s.has_hex_digits_at(s.pos + 1, 4) {
					s.error(r'`\u` incomplete 16 bit unicode character value')
				}
				s.u16_escapes_pos << s.pos - 1
			}
			// Escape `\U`
			if c == `U` {
				if !s.has_hex_digits_at(s.pos + 1, 8) {
					s.error(r'`\U` incomplete 32 bit unicode character value')
				}
				s.u32_escapes_pos << s.pos - 1
			}
			// Unknown escape sequence
			if !util.is_escape_sequence(c) && !digit_table[c] && c != `\n` {
				s.error('`${c.ascii_str()}` unknown escape sequence')
			}
		}
		// ${var} (ignore in vfmt mode) (skip \$)
		if prevc == `$` && c == `{` && !is_raw
			&& s.count_symbol_before(s.pos - 2, backslash) & 1 == 0 {
			s.is_inside_string = true
			// so that s.pos points to $ at the next step
			s.pos -= 2
			break
		}
		if c != backslash {
			backslash_count = 0
		}
	}
	mut lit := ''
	mut end := s.pos
	if s.is_inside_string {
		end++
	}
	if start <= s.pos {
		mut string_so_far := s.text[start..end]
		if !s.is_fmt {
			mut segment_idx := 0
			s.str_segments.clear()
			if s.u16_escapes_pos.len + s.h_escapes_pos.len + s.u32_escapes_pos.len > 0 {
				// decode the recorded escapes in the order, in which they appear in the literal
				s.all_pos.clear()
				s.all_pos << s.u16_escapes_pos
				s.all_pos << s.u32_escapes_pos
				s.all_pos << s.h_escapes_pos
				s.all_pos.sort()

				for pos in s.all_pos {
					s.str_segments << string_so_far[segment_idx..(pos - start)]
					segment_idx = pos - start
					if pos in s.u16_escapes_pos {
						decoded := s.decode_u16_escape_single(string_so_far, segment_idx)
						s.str_segments << decoded.segment
						segment_idx = decoded.idx
					}
					if pos in s.u32_escapes_pos {
						decoded := s.decode_u32_escape_single(string_so_far, segment_idx)
						s.str_segments << decoded.segment
						segment_idx = decoded.idx
					}
					if pos in s.h_escapes_pos {
						decoded := s.decode_h_escape_single(string_so_far, segment_idx)
						s.str_segments << decoded.segment
						segment_idx = decoded.idx
					}
				}
			}
			if segment_idx < string_so_far.len {
				s.str_segments << string_so_far[segment_idx..]
			}
			string_so_far = s.str_segments.join('')
		}

		if n_cr_chars > 0 {
			string_so_far = string_so_far.replace('\r', '')
		}
		if !is_raw && string_so_far.contains('\\\n') {
			lit = trim_slash_line_break(string_so_far)
		} else {
			lit = string_so_far
		}
	}
	// `end` is equal to s.text.len after an unfinished literal, so guard the read
	if end < s.text.len && s.text[end] == quote {
		s.str_helper_tokens.delete_last()
	}
	return lit
}

// has_hex_digits_at reports whether the `n` bytes of s.text, that start at index `start`,
// all lie inside the text, and are all hexadecimal digits.
@[direct_array_access]
fn (s &Scanner) has_hex_digits_at(start int, n int) bool {
	if start < 0 || start + n > s.text.len {
		return false
	}
	for i in start .. start + n {
		if !s.text[i].is_hex_digit() {
			return false
		}
	}
	return true
}
1383
// DecodedEscape is the result of decoding a single escape sequence.
// The decode_*_escape_single methods return `DecodedEscape{0, ''}` after reporting an error.
struct DecodedEscape {
	idx     int    // index in the input string, just past the decoded escape sequence
	segment string // the bytes, that the escape sequence was decoded to
}
1388
// decode_h_escape_single decodes the `\xXX` escape, that starts at `idx` in `str`.
// It returns the resulting single byte as a string, together with the index just past the escape.
// When the escape does not fit in `str`, an error is reported, and `DecodedEscape{0, ''}` is returned.
fn (mut s Scanner) decode_h_escape_single(str string, idx int) DecodedEscape {
	digits_start := idx + 2 // skip over `\x`
	end_idx := idx + 4 // "\xXX".len == 4
	if digits_start > str.len || end_idx > str.len {
		s.error_with_pos('unfinished single hex escape started at', s.current_pos())
		return DecodedEscape{0, ''}
	}
	// no real decoding happens here: '\xc0' is just replaced with the byte 0xc0,
	// and digits that can not be parsed produce a 0 byte
	hex_digits := str[digits_start..end_idx]
	byte_value := u8(strconv.parse_uint(hex_digits, 16, 8) or { 0 })
	return DecodedEscape{
		idx:     end_idx
		segment: [byte_value].bytestr()
	}
}
1401
// decode_h_escapes replaces every single-byte inline hex escape like '\xc0' in `sinput`
// with the byte that it denotes. `escapes_pos` holds the positions of the escapes,
// and `start` is subtracted from each of them, to get an index in `sinput`.
// It returns `sinput` unchanged, when there are no escapes.
fn (mut s Scanner) decode_h_escapes(sinput string, start int, escapes_pos []int) string {
	if escapes_pos.len == 0 {
		return sinput
	}
	last := escapes_pos.len - 1
	mut parts := []string{cap: escapes_pos.len * 2 + 1}
	// everything before the first escape is copied as is
	parts << sinput[..escapes_pos.first() - start]
	for n, epos in escapes_pos {
		esc := s.decode_h_escape_single(sinput, epos - start)
		if esc.idx > sinput.len {
			s.error_with_pos('unfinished hex escape started at', s.current_pos())
			return ''
		}
		parts << esc.segment
		// the literal text runs up to the next escape, or to the end of the input after the last one
		chunk_end := if n < last { escapes_pos[n + 1] - start } else { sinput.len }
		parts << sinput[esc.idx..chunk_end]
	}
	return parts.join('')
}
1425
// decode_o_escapes handles single-byte inline octal escapes like '\###'.
// `escapes_pos` holds the positions of the escapes, and `start` is subtracted
// from each of them, to get an index in `sinput`.
// It returns `sinput` unchanged, when there are no escapes, and '' after reporting
// an error, when an escape is cut short by the end of `sinput`.
fn (mut s Scanner) decode_o_escapes(sinput string, start int, escapes_pos []int) string {
	if escapes_pos.len == 0 {
		return sinput
	}
	// one chunk before the first escape, then a decoded byte and a literal chunk per escape
	// (the same capacity, that decode_h_escapes reserves)
	mut ss := []string{cap: escapes_pos.len * 2 + 1}
	ss << sinput[..escapes_pos.first() - start] // everything before the first escape code position
	for i, pos in escapes_pos {
		idx := pos - start
		end_idx := idx + 4 // "\XXX".len == 4
		if end_idx > sinput.len {
			s.error_with_pos('unfinished octal escape started at', s.current_pos())
			return ''
		}
		// notice this function doesn't do any decoding... it just replaces '\141' with the byte 0o141
		// digits that can not be parsed as an 8 bit value, produce a 0 byte
		octal_byte := u8(strconv.parse_uint(sinput[idx + 1..end_idx], 8, 8) or { 0 })
		ss << [octal_byte].bytestr()
		if i + 1 < escapes_pos.len {
			ss << sinput[end_idx..escapes_pos[i + 1] - start]
		} else {
			ss << sinput[end_idx..]
		}
	}
	return ss.join('')
}
1451
// decode_u16_escape_single decodes the `\uXXXX` escape, that starts at `idx` in `str`.
// It returns the utf-8 encoding of the code point, together with the index just past the escape.
// When the escape does not fit in `str`, an error is reported, and `DecodedEscape{0, ''}` is returned.
fn (mut s Scanner) decode_u16_escape_single(str string, idx int) DecodedEscape {
	digits_start := idx + 2 // skip over `\u`
	end_idx := idx + 6 // "\uXXXX".len == 6
	if digits_start > str.len || end_idx > str.len {
		s.error_with_pos('unfinished u16 escape started at', s.current_pos())
		return DecodedEscape{0, ''}
	}
	// digits that can not be parsed, fall back to code point 0
	code_point := strconv.parse_uint(str[digits_start..end_idx], 16, 32) or { 0 }
	// report code points, that can not be encoded
	if rune(code_point).length_in_bytes() == -1 {
		s.error('invalid unicode point `${str}`')
	}
	return DecodedEscape{
		idx:     end_idx
		segment: utf32_to_str(u32(code_point))
	}
}
1465
// decode_u16erune decodes the 16 bit unicode escape at the start of `str` into its utf-8 bytes.
// Whatever follows the escape in `str`, is appended unchanged to the result.
fn (mut s Scanner) decode_u16erune(str string) string {
	esc := s.decode_u16_escape_single(str, 0)
	// the remainder is empty, when the escape covers the whole input
	remainder := str[esc.idx..]
	return esc.segment + remainder
}
1477
// decode_u32_escape_single decodes the `\UXXXXXXXX` escape, that starts at `idx` in `str`.
// It returns the utf-8 encoding of the code point, together with the index just past the escape.
// When the escape does not fit in `str`, an error is reported, and `DecodedEscape{0, ''}` is returned.
fn (mut s Scanner) decode_u32_escape_single(str string, idx int) DecodedEscape {
	digits_start := idx + 2 // skip over `\U`
	end_idx := idx + 10 // "\UXXXXXXXX".len == 10
	if digits_start > str.len || end_idx > str.len {
		s.error_with_pos('unfinished u32 escape started at', s.current_pos())
		return DecodedEscape{0, ''}
	}
	// digits that can not be parsed, fall back to code point 0
	code_point := strconv.parse_uint(str[digits_start..end_idx], 16, 32) or { 0 }
	// report code points, that can not be encoded
	if rune(code_point).length_in_bytes() == -1 {
		s.error('invalid unicode point `${str}`')
	}
	return DecodedEscape{
		idx:     end_idx
		segment: utf32_to_str(u32(code_point))
	}
}
1491
// decode_u32erune decodes the 32 bit unicode escape at the start of `str` into its utf-8 bytes.
// Whatever follows the escape in `str`, is appended unchanged to the result.
fn (mut s Scanner) decode_u32erune(str string) string {
	esc := s.decode_u32_escape_single(str, 0)
	// the remainder is empty, when the escape covers the whole input
	remainder := str[esc.idx..]
	return esc.segment + remainder
}
1503
// trim_slash_line_break removes the line continuations from `s`: a backslash that is directly
// followed by a newline is dropped, together with the newline and all the whitespace after it.
// A backslash+newline pair is only removed, when the run of backslashes that ends right before
// the newline has an odd length; with an even length, the last backslash is itself escaped.
fn trim_slash_line_break(s string) string {
	mut result := s
	mut search_from := 0
	for {
		// position of the next `\`, that is directly followed by a newline
		bs_pos := result.index_after('\\\n', search_from) or { break }
		search_from = bs_pos
		// measure the run of backslashes, that ends at bs_pos
		mut before_run := bs_pos
		for before_run >= 0 && result[before_run] == `\\` {
			before_run--
		}
		run_len := bs_pos - before_run
		if bs_pos != 0 && run_len & 1 == 0 {
			// nothing was stripped, so step forward to make sure that the loop terminates
			search_from++
			continue
		}
		remainder := result[bs_pos + 2..].trim_left(' \n\t\v\f\r')
		result = result[..bs_pos] + remainder
	}
	return result
}
1528
// ident_char is called when a backtick "single-char" is parsed from the code
// it is needed because some runes (chars) are written with escape sequences
// the string it returns should be a standardized, simplified version of the character
// as it would appear in source code
// possibilities:
// single chars like `a`, `b` => 'a', 'b'
// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
// escaped single hex bytes like `\x01`, `\x61` => '\x01', 'a'
// escaped unicode literals like `\u2605`
// escaped unicode 32 literals like `\U00002605`
// escaped utf8 runes in hex like `\xe2\x98\x85` => (★)
// escaped utf8 runes in octal like `\342\230\205` => (★)
// On entry, s.pos is expected to be at the opening backtick; on return it is at the
// closing backtick, or at s.text.len, when no closing backtick was found.
// In vfmt mode (s.is_fmt), the text between the backticks is returned verbatim.
pub fn (mut s Scanner) ident_char() string {
	// position of the start of the literal, used for the errors that are reported below
	lspos := token.Pos{
		line_nr:  s.line_nr
		pos:      s.pos
		col:      u16(s.pos - s.last_nl_pos - 1)
		file_idx: s.file_idx
	}

	start := s.pos // the string position of the first backtick char
	slash := `\\`
	mut len := 0

	// set flags for advanced escapes first
	// (each one needs the escape prefix right after the backtick, followed by a digit of its base)
	escaped_hex := s.expect('\\x', start + 1) && s.text.len > start + 3
		&& s.text[start + 3].is_hex_digit()
	escaped_unicode_16 := s.expect('\\u', start + 1) && s.text.len > start + 3
		&& s.text[start + 3].is_hex_digit()
	escaped_unicode_32 := s.expect('\\U', start + 1) && s.text.len > start + 3
		&& s.text[start + 3].is_hex_digit()
	escaped_octal := !escaped_hex && !escaped_unicode_16 && !escaped_unicode_32
		&& s.expect('\\', start + 1) && s.text.len > start + 2 && s.text[start + 2].is_oct_digit()

	// walk the string to get characters up to the next backtick
	for {
		s.pos++
		if s.pos >= s.text.len {
			break
		}
		// `len` counts the bytes that are not backslashes, including the closing backtick
		if s.text[s.pos] != slash {
			len++
		}
		double_slash := s.expect('\\\\', s.pos - 2)
		// a backtick ends the literal, unless it is escaped by a single backslash
		if s.text[s.pos] == `\`` && (s.text[s.pos - 1] != slash || double_slash) {
			// ` // apostrophe balance comment. do not remove
			if double_slash {
				len++
			}
			break
		}
	}
	// do not count the closing backtick
	len--
	mut c := s.text[start + 1..s.pos]
	if s.is_fmt {
		return c
	}
	if len != 1 {
		// the string inside the backticks is longer than one character
		// but we might only have one rune... attempt to decode escapes
		// if the content expresses an escape code, it will have an even number of characters
		// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605
		// we don't handle binary escape codes in rune literals
		orig := c
		if c.len & 1 == 0
			&& (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) {
			if escaped_unicode_16 {
				// there can only be one, so attempt to decode it now
				c = s.decode_u16erune(c)
			} else if escaped_unicode_32 {
				// there can only be one, so attempt to decode it now
				c = s.decode_u32erune(c)
			} else {
				// find escape sequence start positions
				mut escapes_pos := []int{}
				for i, v in c {
					if v == `\\` {
						escapes_pos << i
					}
				}
				if escaped_hex {
					c = s.decode_h_escapes(c, 0, escapes_pos)
				} else {
					c = s.decode_o_escapes(c, 0, escapes_pos)
				}
			}
		}

		// after the decoding, the content must be exactly one rune
		u := c.runes()
		if u.len != 1 {
			// list the individual characters for the error message,
			// keeping each backslash together with the character that follows it
			mut err_info := []string{cap: u.len}
			mut i := 0
			for i < u.len {
				if u[i] != `\\` || i == u.len - 1 {
					err_info << '`${u[i]}`'
					i++
					continue
				}
				err_info << '`\\${u[i + 1]}`'
				i += 2
			}
			if escaped_hex || escaped_unicode_16 || escaped_unicode_32 {
				s.error_with_pos('invalid character literal `${orig}` => `${c}` ([${err_info.join(', ')}]) (escape sequence did not refer to a singular rune)',
					lspos)
			} else if u.len == 0 {
				s.add_error_detail('use quotes for strings, backticks for characters')
				s.error_with_pos('invalid empty character literal `${orig}`', lspos)
			} else {
				s.add_error_detail('use quotes for strings, backticks for characters')
				s.error_with_pos('invalid character literal `${orig}` => `${c}` ([${err_info.join(', ')}]) (more than one character)',
					lspos)
			}
		}
	} else if c.ends_with('\n') {
		s.add_error_detail('use quotes for strings, backticks for characters')
		s.error_with_pos('invalid character literal, use \`\\n\` instead', lspos)
	} else if c.len > len {
		// the content has more bytes than were counted, i.e. it contains a backslash:
		// check that its last byte is a known escape character or a digit
		ch := c[c.len - 1]
		if !util.is_escape_sequence(ch) && !digit_table[ch] {
			s.error('`${ch.ascii_str()}` unknown escape sequence')
		}
	}
	// Escapes a `'` character
	if c == "'" {
		return '\\' + c
	}
	return c
}
1657
// expect reports whether the text contains `want` exactly at `start_pos`.
// It returns false, when `start_pos` is outside of the text, or when `want` does not fit in it.
@[direct_array_access; inline]
fn (s &Scanner) expect(want string, start_pos int) bool {
	end_pos := start_pos + want.len
	fits := start_pos >= 0 && end_pos >= 0 && start_pos < s.text.len && end_pos <= s.text.len
	if !fits {
		return false
	}
	for offset in 0 .. want.len {
		if s.text[start_pos + offset] != want[offset] {
			return false
		}
	}
	return true
}
1671
// ignore_line advances s.pos to the next line feed (or to the end of the text),
// and then registers the new line through inc_line_number.
@[direct_array_access; inline]
fn (mut s Scanner) ignore_line() {
	// same scan as eat_to_end_of_line
	for s.pos < s.text.len && s.text[s.pos] != b_lf {
		s.pos++
	}
	s.inc_line_number()
}
1677
// eat_to_end_of_line advances s.pos, until it is on a line feed, or at the end of the text.
@[direct_array_access; inline]
fn (mut s Scanner) eat_to_end_of_line() {
	for s.pos < s.text.len {
		if s.text[s.pos] == b_lf {
			break
		}
		s.pos++
	}
}
1684
// inc_line_number records the current position as the position of the last newline
// (clamped to the index of the last byte of the text), moves on to the next line,
// and raises the count of scanned lines, when the new line number exceeds it.
@[direct_array_access; inline]
fn (mut s Scanner) inc_line_number() {
	last_idx := s.text.len - 1
	s.last_nl_pos = int_min(s.pos, last_idx)
	s.line_nr++
	if s.nr_lines < s.line_nr {
		s.nr_lines = s.line_nr
	}
}
1693
// current_pos returns a token.Pos for the current scanner position.
// Its column is the current column minus one, clamped to 0 by u16_col.
pub fn (mut s Scanner) current_pos() token.Pos {
	col := s.current_column() - 1
	return token.Pos{
		line_nr:  s.line_nr
		pos:      s.pos
		col:      u16_col(col)
		file_idx: s.file_idx
	}
}
1702
// note reports a notice with the message `msg` at the current position.
// With `notes_are_errors` set, it is reported as an error instead.
// Otherwise it is either printed right away (stdout output mode without check_only),
// or appended to `s.notices`.
pub fn (mut s Scanner) note(msg string) {
	if s.pref.notes_are_errors {
		s.error_with_pos(msg, s.current_pos())
		return
	}
	notice_pos := token.Pos{
		line_nr:  s.line_nr
		pos:      s.pos
		file_idx: s.file_idx
	}
	if s.pref.output_mode != .stdout || s.pref.check_only {
		// collect it for the caller, instead of printing it
		s.notices << errors.Notice{
			file_path: s.file_path
			pos:       notice_pos
			reporter:  .scanner
			message:   msg
		}
		return
	}
	util.show_compiler_message('notice:', pos: notice_pos, file_path: s.file_path, message: msg)
}
1724
// add_error_detail appends `msg` to the pending error details.
// Call this *before* calling error or warn.
pub fn (mut s Scanner) add_error_detail(msg string) {
	s.error_details << msg
}
1729
// add_error_detail_with_pos adds `msg` as an error detail, formatted as a 'details:' message
// for the position `pos` in the scanned file, and preceded by a newline.
pub fn (mut s Scanner) add_error_detail_with_pos(msg string, pos token.Pos) {
	formatted := util.formatted_error('details:', msg, s.file_path, pos)
	s.add_error_detail('\n' + formatted)
}
1733
// eat_details returns the pending error details, joined by newlines, and then clears them.
// It returns '', when there are no pending details.
fn (mut s Scanner) eat_details() string {
	if s.error_details.len == 0 {
		return ''
	}
	joined := s.error_details.join('\n')
	s.error_details = []
	return joined
}
1742
// warn reports a warning with the message `msg` at the current position.
pub fn (mut s Scanner) warn(msg string) {
	here := s.current_pos()
	s.warn_with_pos(msg, here)
}
1746
// warn_with_pos reports a warning with the message `msg` at the position `pos`.
// With `warns_are_errors` set, it is reported as an error instead.
// Otherwise the pending error details are consumed, and the warning is either printed
// right away (stdout output mode without check_only), or appended to `s.warnings`.
// Once `message_limit` warnings are collected, `s.should_abort` is set instead.
pub fn (mut s Scanner) warn_with_pos(msg string, pos token.Pos) {
	if s.pref.warns_are_errors {
		s.error_with_pos(msg, pos)
		return
	}
	details := s.eat_details()
	prints_now := s.pref.output_mode == .stdout && !s.pref.check_only
	if prints_now {
		util.show_compiler_message('warning:',
			pos:       pos
			file_path: s.file_path
			message:   msg
			details:   details
		)
		return
	}
	limit := s.pref.message_limit
	if limit >= 0 && s.warnings.len >= limit {
		s.should_abort = true
		return
	}
	s.warnings << errors.Warning{
		file_path: s.file_path
		pos:       pos
		reporter:  .scanner
		message:   msg
		details:   details
	}
}
1774
// error reports an error with the message `msg` at the current position.
pub fn (mut s Scanner) error(msg string) {
	here := s.current_pos()
	s.error_with_pos(msg, here)
}
1778
// error_with_pos reports an error with the message `msg` at the position `pos`,
// consuming the pending error details.
// In stdout output mode without check_only, or with `fatal_errors` set,
// the error is printed, and the program exits with code 1.
// Otherwise the error is appended to `s.errors`; once `message_limit` errors
// are collected, `s.should_abort` is set instead.
pub fn (mut s Scanner) error_with_pos(msg string, pos token.Pos) {
	details := s.eat_details()
	prints_now := s.pref.output_mode == .stdout && !s.pref.check_only
	if prints_now || s.pref.fatal_errors {
		util.show_compiler_message('error:',
			pos:       pos
			file_path: s.file_path
			message:   msg
			details:   details
		)
		exit(1)
	}
	limit := s.pref.message_limit
	if limit >= 0 && s.errors.len >= limit {
		s.should_abort = true
		return
	}
	s.errors << errors.Error{
		file_path: s.file_path
		pos:       pos
		reporter:  .scanner
		message:   msg
		details:   details
	}
}
1812
// trace prints the address and the value of `x`, but only while scanning
// the file, whose base name is equal to `fbase`.
fn (mut s Scanner) trace[T](fbase string, x &T) {
	if s.file_base != fbase {
		return
	}
	println('> s.trace | ${fbase:-10s} | ${voidptr(x):16} | ${x}')
}
1818
// prepare_for_new_text makes the scanner reusable for another input: it stores `text`
// as the text to scan, and resets the rest of the internal scanning state.
// Follow it with a s.scan_text() call, to get the tokens that correspond to the text.
pub fn (mut s Scanner) prepare_for_new_text(text string) {
	// the new input and the cursor
	s.text = text
	s.pos = -1
	s.tidx = 0
	s.eofs = 0
	// line bookkeeping
	s.line_nr = 0
	s.nr_lines = 0
	s.last_nl_pos = -1
	s.last_lt = -1
	// string lexing state
	s.quote = 0
	s.is_inside_string = false
	s.is_nested_string = false
	s.is_inside_toplvl_statement = false
	s.str_helper_tokens.clear()
	s.str_segments.clear()
	s.all_pos.clear()
	s.u16_escapes_pos.clear()
	s.u32_escapes_pos.clear()
	s.h_escapes_pos.clear()
	// collected tokens and messages
	s.all_tokens.clear()
	s.errors.clear()
	s.error_details.clear()
	s.warnings.clear()
	s.notices.clear()
	s.should_abort = false
}
1848
// new_silent_scanner returns a new scanner instance, that uses fresh preferences with the
// `.silent` output mode. Such a scanner just sets internal flags, and appends errors to its
// .errors field, *without aborting the program*. It is mainly useful for programs that want to
// lex potentially invalid V source code repeatedly, and do their own error handling
// (checking .errors.len).
pub fn new_silent_scanner() &Scanner {
	mut silent_prefs := pref.new_preferences()
	silent_prefs.output_mode = .silent
	return &Scanner{
		pref: silent_prefs
	}
}
1859
// str_quote returns the last entry of `s.str_helper_tokens`, when that entry is a
// single or a double quote. It returns 255 otherwise, and also when there are no entries.
@[direct_array_access]
fn (s Scanner) str_quote() u8 {
	n := s.str_helper_tokens.len
	if n == 0 {
		return 255
	}
	top := s.str_helper_tokens[n - 1]
	if top !in [`'`, `"`] {
		return 255
	}
	return top
}
1871
// is_likely_unclosed_string_interpolation reports whether all of these hold:
// `current_quote` is the same as `s.quote`, `s.str_helper_tokens` is not empty, and its last
// entry is not a quote (str_quote returns 255), and the kind of the most recently scanned
// token is one of: number, string, chartoken, `)`, `]` or `}`.
@[direct_array_access; inline]
fn (s &Scanner) is_likely_unclosed_string_interpolation(current_quote u8) bool {
	if current_quote != s.quote {
		return false
	}
	if s.str_helper_tokens.len == 0 || s.str_quote() != 255 {
		return false
	}
	if s.all_tokens.len == 0 {
		return false
	}
	last_kind := s.all_tokens[s.all_tokens.len - 1].kind
	return last_kind in [.number, .string, .chartoken, .rpar, .rsbr, .rcbr]
}
1881
// u16_col converts the column `col` to an u16; negative columns become 0.
@[inline]
fn u16_col(col int) u16 {
	if col < 0 {
		return u16(0)
	}
	return u16(col)
}
1886