v / vlib / v2 / scanner / scanner.v
632 lines · 619 sloc · 12.31 KB · 726559c029ba7e9171e3f7cb5dcfd204ee42f7e3
Raw
1// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module scanner
5
6import v2.token
7import v2.pref
8
// Mode is a set of flags which control how the scanner behaves.
// It is a `@[flag]` enum, so values are tested with `mode.has(...)`.
@[flag]
pub enum Mode {
	// no use of `.normal` is visible in this file
	// NOTE(review): presumably the default mode - confirm against callers
	normal
	// return `.comment` tokens from `scan` instead of skipping comments
	scan_comments
	// scan over `${...}` inside string literals rather than stopping at it,
	// so the whole literal is consumed by a single `string_literal` call
	skip_interpolation
}
15
// Scanner holds the state needed to turn a source string into tokens.
// An instance is created with `new_scanner` and (re)armed for a source
// with `init`; `scan` then returns one token per call.
pub struct Scanner {
	// preferences given to `new_scanner`
	pref &pref.Preferences
	// mode flags given to `new_scanner`
	mode Mode
	// set from `mode.has(.skip_interpolation)` in `new_scanner`
	skip_interpolation bool
mut:
	// file being scanned, receives line offsets through `add_line`
	file &token.File = &token.File{}
	// when true, the next newline (or the end of the source) is
	// returned as a `.semicolon` token
	insert_semi bool
pub mut:
	src    string
	offset int // current char offset
	pos    int // token offset (start of current token)
	lit    string
	// strings literals & interpolation
	// set once the `}` closing an interpolation was scanned, the next
	// call to `scan` then continues with the rest of the string
	in_str_incomplete bool
	// true while scanning inside a `${...}` of a string
	in_str_inter bool
	// number of `{` currently open inside the interpolation
	str_inter_cbr_depth int
	// quote char of the string being scanned, recorded when a string
	// starts outside of an interpolation
	str_quote u8
}
34
// new_scanner allocates a scanner for the given preferences and mode.
// The `.skip_interpolation` flag of `mode` is looked up once here and
// stored in its own field. Call `init` before scanning.
pub fn new_scanner(prefs &pref.Preferences, mode Mode) &Scanner {
	skip_inter := mode.has(.skip_interpolation)
	unsafe {
		return &Scanner{
			pref:               prefs
			mode:               mode
			skip_interpolation: skip_inter
		}
	}
}
44
// init points the scanner at `file` / `src` and clears all scanning state,
// so that one scanner instance can be used for several sources in turn.
pub fn (mut s Scanner) init(file &token.File, src string) {
	// new input
	s.file = unsafe { file }
	s.src = src
	// token state
	s.pos = 0
	s.offset = 0
	s.lit = ''
	s.insert_semi = false
	// string literal & interpolation state
	s.str_quote = 0
	s.str_inter_cbr_depth = 0
	s.in_str_inter = false
	s.in_str_incomplete = false
}
59
// scan_char_literal scans a char literal delimited by `quote`.
// On entry `s.offset` is at the opening quote. `s.lit` receives the text
// between the quotes, and the closing quote is consumed when there is one.
// An unterminated literal runs to the end of the source.
fn (mut s Scanner) scan_char_literal(quote u8) token.Token {
	// opening quote
	s.offset++
	for s.offset < s.src.len {
		ch := s.src[s.offset]
		if ch == quote {
			break
		}
		// a backslash takes the following char with it, as long as there is one
		is_escape := ch == `\\` && s.offset + 1 < s.src.len
		s.offset += if is_escape { 2 } else { 1 }
	}
	lit_end := s.offset
	// closing quote, absent when the literal is unterminated
	if s.offset < s.src.len && s.src[s.offset] == quote {
		s.offset++
	}
	s.lit = s.src[s.pos + 1..lit_end]
	s.insert_semi = true
	return .char
}
82
// current_file gives access to the file the scanner was last
// initialised with (the handle stored by `init`).
pub fn (s &Scanner) current_file() &token.File {
	file := unsafe { s.file }
	return file
}
87
// scan returns the next token of the source and advances the scanner.
//
// `s.pos` is set to the offset where the token starts. `s.lit` holds the
// source text for names/keywords, numbers, strings, chars and comments, and
// is reset to '' for the tokens handled by the final `match`.
//
// Semicolons are inserted automatically: when the previous token set
// `insert_semi`, a following newline, or the end of the source, is returned
// as `.semicolon`. Comments are skipped unless the mode has `.scan_comments`.
// `.eof` is returned once the source is exhausted.
@[direct_array_access]
pub fn (mut s Scanner) scan() token.Token {
	// continue the string literal which was interrupted by an interpolation.
	// before whitespace call to keep whitespaces in string
	// NOTE: before start: simply for a little more efficiency
	// if !s.skip_interpolation && s.in_str_incomplete {
	if s.in_str_incomplete {
		s.in_str_incomplete = false
		s.pos = s.offset
		s.string_literal(false, s.str_quote)
		s.lit = s.src[s.pos..s.offset]
		return .string
	}
	start:
	s.whitespace()
	if s.offset == s.src.len {
		s.lit = ''
		// terminate the last statement before reporting eof
		if s.insert_semi {
			s.insert_semi = false
			return .semicolon
		}
		s.file.add_line(s.offset)
		return .eof
	}
	c := s.src[s.offset]
	s.pos = s.offset
	// a comment must not cancel a pending semicolon, so remember the flag
	preserve_insert_semi := s.insert_semi
	s.insert_semi = false
	// whitespace only stops at a newline when a semicolon is to be inserted
	if c == `\n` {
		s.lit = ''
		return .semicolon
	}
	// comment | `/=` | `/`
	else if c == `/` {
		c2 := s.src[s.offset + 1]
		// comment
		if c2 in [`/`, `*`] {
			if preserve_insert_semi {
				s.insert_semi = true
			}
			s.comment()
			if !s.mode.has(.scan_comments) {
				unsafe {
					goto start
				}
			}
			s.lit = s.src[s.pos..s.offset]
			return .comment
		}
		// `/=`
		else if c2 == `=` {
			s.offset += 2
			return .div_assign
		}
		s.offset++
		// `/`
		return .div
	}
	// number
	else if c >= `0` && c <= `9` {
		s.number()
		s.lit = s.src[s.pos..s.offset]
		s.insert_semi = true
		return .number
	}
	// keyword | name
	else if (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) || c in [`_`, `@`] {
		s.offset++
		// NOTE: I have made `@[` a token instead of using `@` and `[` because `@`
		// is not currently used as a token, and it is also easier to parse this way.
		// if/when `@` becomes used as a token of its own, then I may change this.
		if c == `@` && s.src[s.offset] == `[` {
			s.offset++
			return .attribute
		}
		for s.offset < s.src.len {
			c3 := s.src[s.offset]
			if c3.is_alnum() || c3 == `_` {
				s.offset++
				continue
			}
			break
		}
		s.lit = s.src[s.pos..s.offset]
		tok := token.Token.from_string_tinyv(s.lit)
		// only these tokens may end a statement
		if tok in [.key_break, .key_continue, .key_none, .key_return, .key_false, .key_true, .name] {
			s.insert_semi = true
		}
		return tok
	}
	// string
	else if c in [`'`, `"`] {
		s.offset++
		if !s.in_str_inter {
			s.str_quote = c
		}
		// TODO: I would prefer a better way to handle raw
		// a raw string has an `r` right before the quote. only look at the
		// previous char when there is one: a quote at offset 0 has nothing
		// before it and must not index the source with -1.
		is_raw := s.pos > 0 && s.src[s.pos - 1] == `r`
		// strings inside an interpolation are scanned as raw too
		s.string_literal(s.in_str_inter || is_raw, c)
		s.lit = s.src[s.pos..s.offset]
		s.insert_semi = true
		return .string
	}
	// byte (char) `a`
	else if c == `\`` {
		return s.scan_char_literal(c)
	}
	// s.lit not set, as tokens below get converted directly to string
	// s.lit = c
	s.lit = ''
	s.offset++
	match c {
		`.` {
			c2 := s.src[s.offset]
			if c2 >= `0` && c2 <= `9` {
				// TODO: only really need decimal
				s.number()
				s.lit = s.src[s.pos..s.offset]
				// same as for numbers starting with a digit: a number may end a statement
				s.insert_semi = true
				return .number
			} else if c2 == `.` {
				s.offset++
				if s.src[s.offset] == `.` {
					s.offset++
					return .ellipsis
				}
				return .dotdot
			}
			return .dot
		}
		`:` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .decl_assign
			}
			return .colon
		}
		`!` {
			c2 := s.src[s.offset]
			if c2 == `=` {
				s.offset++
				return .ne
			} else if c2 == `i` {
				// `!in` / `!is` need a space or tab after them. the lookahead is
				// bounds checked, so that `!i` at the very end of the source
				// does not read past the buffer.
				if s.offset + 2 < s.src.len {
					c3 := s.src[s.offset + 1]
					c4_is_space := s.src[s.offset + 2] in [` `, `\t`]
					if c3 == `n` && c4_is_space {
						s.offset += 2
						return .not_in
					} else if c3 == `s` && c4_is_space {
						s.offset += 2
						return .not_is
					}
				}
			}
			s.insert_semi = true
			return .not
		}
		`=` {
			c2 := s.src[s.offset]
			if c2 == `=` {
				s.offset++
				return .eq
			}
			return .assign
		}
		`+` {
			c2 := s.src[s.offset]
			if c2 == `+` {
				s.offset++
				return .inc
			} else if c2 == `=` {
				s.offset++
				return .plus_assign
			}
			return .plus
		}
		`-` {
			c2 := s.src[s.offset]
			if c2 == `-` {
				s.offset++
				return .dec
			} else if c2 == `=` {
				s.offset++
				return .minus_assign
			}
			return .minus
		}
		`%` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .mod_assign
			}
			return .mod
		}
		`*` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .mul_assign
			}
			return .mul
		}
		`^` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .xor_assign
			}
			return .xor
		}
		`&` {
			c2 := s.src[s.offset]
			if c2 == `&` {
				// Parse logical and assignment as bitwise-and assignment token for now.
				// It is later lowered by transformer/type-aware stages as needed.
				if s.offset + 1 < s.src.len && s.src[s.offset + 1] == `=` {
					s.offset += 2
					return .and_assign
				}
				// so that we parse &&Type as two .amp instead of .and
				// but this requires there is a space. we could check
				// for capital or some other way, this is simplest for now.
				if s.offset + 1 < s.src.len && s.src[s.offset + 1] in [` `, `\t`] {
					s.offset++
					return .and
				}
			} else if c2 == `=` {
				s.offset++
				return .and_assign
			}
			return .amp
		}
		`|` {
			c2 := s.src[s.offset]
			if c2 == `|` {
				// Parse logical or assignment as bitwise-or assignment token for now.
				if s.offset + 1 < s.src.len && s.src[s.offset + 1] == `=` {
					s.offset += 2
					return .or_assign
				}
				s.offset++
				return .logical_or
			} else if c2 == `=` {
				s.offset++
				return .or_assign
			}
			return .pipe
		}
		`<` {
			c2 := s.src[s.offset]
			if c2 == `<` {
				s.offset++
				if s.src[s.offset] == `=` {
					s.offset++
					return .left_shift_assign
				}
				return .left_shift
			} else if c2 == `=` {
				s.offset++
				return .le
			} else if c2 == `-` {
				s.offset++
				return .arrow
			}
			return .lt
		}
		`>` {
			c2 := s.src[s.offset]
			if c2 == `>` {
				s.offset++
				c3 := s.src[s.offset]
				if c3 == `>` {
					s.offset++
					if s.src[s.offset] == `=` {
						s.offset++
						return .right_shift_unsigned_assign
					}
					return .right_shift_unsigned
				} else if c3 == `=` {
					s.offset++
					return .right_shift_assign
				}
				return .right_shift
			} else if c2 == `=` {
				s.offset++
				return .ge
			}
			return .gt
		}
		`#` {
			// if we choose to scan whole line
			// s.line()
			return .hash
		}
		// `@` { return .at }
		`~` {
			return .bit_not
		}
		`,` {
			return .comma
		}
		`$` {
			if s.in_str_inter {
				return .str_dollar
			}
			return .dollar
		}
		`{` {
			if s.in_str_inter {
				s.str_inter_cbr_depth++
			}
			return .lcbr
		}
		`}` {
			if s.in_str_inter {
				s.str_inter_cbr_depth--
				// the interpolation is closed, the next call to scan
				// continues with the remainder of the string literal
				if s.str_inter_cbr_depth == 0 {
					s.in_str_incomplete = true
					s.in_str_inter = false
				}
			}
			s.insert_semi = true
			return .rcbr
		}
		`(` {
			return .lpar
		}
		`)` {
			s.insert_semi = true
			return .rpar
		}
		`[` {
			return .lsbr
		}
		`]` {
			s.insert_semi = true
			return .rsbr
		}
		`;` {
			return .semicolon
		}
		`?` {
			s.insert_semi = true
			return .question
		}
		else {
			return .unknown
		}
	}
}
433
// whitespace advances past spaces, tabs and carriage returns.
// Newlines are skipped as well (and recorded in the file), except when
// `insert_semi` is set: the scanner then stops at the newline so that
// `scan` can turn it into a semicolon.
@[direct_array_access]
fn (mut s Scanner) whitespace() {
	for s.offset < s.src.len {
		match s.src[s.offset] {
			` `, `\t`, `\r` {
				s.offset++
			}
			`\n` {
				if s.insert_semi {
					return
				}
				s.offset++
				s.file.add_line(s.offset)
			}
			else {
				return
			}
		}
	}
}
454
// line advances to the end of the current line.
// It stops at the newline without consuming it (or at the end of the
// source), the newline is left for the next whitespace call to record.
@[direct_array_access]
fn (mut s Scanner) line() {
	for s.offset < s.src.len && s.src[s.offset] != `\n` {
		s.offset++
	}
}
466
// comment advances over a comment. On entry `s.offset` is at the leading `/`.
// A `//` comment runs to the end of the line. A `/* */` comment may be
// nested and may span lines, each newline inside it is recorded in the file.
// An unterminated `/* */` comment runs to the end of the source.
@[direct_array_access]
fn (mut s Scanner) comment() {
	s.offset++
	kind := s.src[s.offset]
	// single line
	if kind == `/` {
		s.line()
		return
	}
	if kind != `*` {
		return
	}
	// multi line
	s.offset++
	mut depth := 1
	for s.offset < s.src.len {
		cur := s.src[s.offset]
		peek := s.src[s.offset + 1]
		if cur == `\n` {
			s.offset++
			s.file.add_line(s.offset)
			continue
		}
		// nested opening
		if cur == `/` && peek == `*` {
			s.offset += 2
			depth++
			continue
		}
		// closing, done once the outermost comment is closed
		if cur == `*` && peek == `/` {
			s.offset += 2
			depth--
			if depth == 0 {
				break
			}
			continue
		}
		s.offset++
	}
}
500
// string_literal advances `s.offset` over the contents of a string literal.
// It starts at `s.offset`: `scan` calls it either right after the opening
// quote, or right after the `}` which closed an interpolation.
//
// scan_as_raw: scan up to and including the next `c_quote`, with no escape or
//              interpolation handling.
// c_quote:     the quote char which terminates the literal.
//
// For normal strings a backslash skips the char after it. At `${` the
// interpolation state is entered: without `skip_interpolation` the function
// returns with `s.offset` at the `$`, so `scan` can tokenise the expression;
// with `skip_interpolation` it keeps going and tracks the braces itself.
// Newlines inside the literal are recorded in the file. An unterminated
// literal runs to the end of the source, `s.offset` never ends up beyond it.
@[direct_array_access]
fn (mut s Scanner) string_literal(scan_as_raw bool, c_quote u8) {
	// shortcut, scan whole string
	if scan_as_raw {
		for s.offset < s.src.len {
			c := s.src[s.offset]
			if c == c_quote {
				break
			}
			if c == `\n` {
				s.offset++
				s.file.add_line(s.offset)
				continue
			}
			s.offset++
		}
		// closing quote, when there is one
		if s.offset < s.src.len {
			s.offset++
		}
		return
	}
	// normal strings
	for s.offset < s.src.len {
		c := s.src[s.offset]
		// escape `\\n` | `\'`
		if c == `\\` {
			s.offset += 2
			// a backslash as the very last char has nothing to escape. do not
			// step past the end, the caller slices the source with s.offset
			// and compares it with the source length to detect eof.
			if s.offset > s.src.len {
				s.offset = s.src.len
			}
			continue
		} else if c == `\n` {
			s.offset++
			s.file.add_line(s.offset)
			continue
		} else if c == `$` && s.src[s.offset + 1] == `{` {
			s.in_str_inter = true
			if s.skip_interpolation {
				s.str_inter_cbr_depth++
				s.offset += 2
				continue
			} else {
				return
			}
		} else if s.skip_interpolation && s.in_str_inter {
			// keep track of the braces to find the end of the interpolation
			if c == `{` {
				s.str_inter_cbr_depth++
			} else if c == `}` {
				s.str_inter_cbr_depth--
				if s.str_inter_cbr_depth == 0 {
					s.in_str_inter = false
				}
			}
		} else if c == c_quote && !s.in_str_inter {
			s.offset++
			break
		}
		s.offset++
	}
}
558
// number advances `s.offset` over a number literal starting at `s.offset`.
// `0b`, `0x` and `0o` prefixed literals take the digits of their base,
// everything else is scanned as decimal with at most one fraction and one
// exponent (which may carry a sign). `_` is accepted between digits.
// A `.` only starts a fraction when a digit follows it, so that `1..2` and
// `1.str()` are left alone.
@[direct_array_access]
fn (mut s Scanner) number() {
	if s.src[s.offset] == `0` {
		s.offset++
		prefix := s.src[s.offset]
		// TODO: impl proper underscore support
		match prefix {
			// 0b (binary)
			`b`, `B` {
				s.offset++
				for s.src[s.offset] in [`0`, `1`, `_`] {
					s.offset++
				}
				return
			}
			// 0x (hex)
			`x`, `X` {
				s.offset++
				for s.src[s.offset].is_hex_digit() || s.src[s.offset] == `_` {
					s.offset++
				}
				return
			}
			// 0o (octal)
			`o`, `O` {
				s.offset++
				for {
					digit := s.src[s.offset]
					is_octal := digit >= `0` && digit <= `7`
					if !is_octal && digit != `_` {
						return
					}
					s.offset++
				}
			}
			else {}
		}
	}
	mut seen_fraction := false
	mut seen_exponent := false
	// TODO: proper impl of fraction / exponent
	// decimal digits, fraction and exponent
	for s.offset < s.src.len {
		ch := s.src[s.offset]
		if ch.is_digit() || ch == `_` {
			s.offset++
		}
		// fraction (only if next char after '.' is a digit, not a letter like '.hex()')
		else if !seen_fraction && ch == `.` && s.src[s.offset + 1].is_digit() {
			seen_fraction = true
			s.offset++
		}
		// exponent
		else if !seen_exponent && ch in [`e`, `E`] {
			seen_exponent = true
			s.offset++
			// optional sign after the exponent marker
			if s.offset < s.src.len && s.src[s.offset] in [`+`, `-`] {
				s.offset++
			}
		} else {
			break
		}
	}
}
633