v2 / vlib / x / json2 / scanner.v
690 lines · 648 sloc · 16.89 KB · e2e5cf8db56f3562c7baa735061690be936bdf3e
Raw
// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
4module json2
5
6import io
7import strconv
8
// JsonScanError describes a tokenization error reported by the iterative scanner APIs.
// It embeds `Error` and carries the position that was recorded for the offending token.
pub struct JsonScanError {
	Error
pub:
	// message is the description of what was wrong with the token.
	message string

	// line is the line number recorded for the error.
	line int
	// character is the column number recorded for the error.
	character int
}
18
// msg renders the error as `<line>:<character>: Invalid json token: <message>`.
fn (e JsonScanError) msg() string {
	position := '${e.line}:${e.character}'
	return '${position}: Invalid json token: ${e.message}'
}
22
// Scanner tokenizes JSON from an in-memory string or byte slice.
pub struct Scanner {
mut:
	// text holds the raw JSON bytes being scanned.
	text []u8
	pos int // the position of the token in scanner text
	// line and col are copied into every token the scanner produces; both start at 1.
	line int = 1
	col int = 1
}
31
// ReaderScanner tokenizes JSON incrementally from any io.Reader.
pub struct ReaderScanner {
mut:
	// reader is the buffered source the bytes are pulled from, one at a time.
	reader &io.BufferedReader
	// peeked is true while `ch` holds a byte that was read but not consumed yet.
	peeked bool
	// ch is the most recently read byte; only meaningful while `peeked` is true.
	ch u8
	// line and col track the position of the next unconsumed byte; both start at 1.
	line int = 1
	col int = 1
}
41
// ReaderScannerConfig configures a reader-backed JSON scanner.
@[params]
pub struct ReaderScannerConfig {
pub:
	// reader is the source of the JSON bytes.
	reader io.Reader
	// buffer_size is passed as the capacity of the buffered reader; defaults to 128 KiB.
	buffer_size int = 128 * 1024
}
49
// TokenKind identifies the kind of a JSON token.
// The punctuation kinds use the ASCII code of their character as value,
// which lets the scanners cast a punctuation byte directly to its kind.
pub enum TokenKind {
	none
	error
	str
	float
	int
	null
	bool
	eof
	comma = 44 // ,
	colon = 58 // :
	lsbr = 91 // [
	rsbr = 93 // ]
	lcbr = 123 // {
	rcbr = 125 // }
}
67
// new_scanner creates an iterative scanner for an in-memory JSON string.
// The string is converted to bytes and scanning starts at line 1, column 1.
pub fn new_scanner(text string) Scanner {
	return new_scanner_from_bytes(text.bytes())
}
76
// new_scanner_from_bytes creates an iterative scanner for an in-memory JSON byte slice.
// Scanning starts at line 1, column 1.
pub fn new_scanner_from_bytes(text []u8) Scanner {
	mut scanner := Scanner{
		text: text
	}
	scanner.line = 1
	scanner.col = 1
	return scanner
}
85
// new_reader_scanner creates an iterative scanner that reads JSON tokens from an io.Reader.
// The reader from `config` is wrapped in a buffered reader of `config.buffer_size` capacity.
pub fn new_reader_scanner(config ReaderScannerConfig) &ReaderScanner {
	buffered := io.new_buffered_reader(reader: config.reader, cap: config.buffer_size)
	return &ReaderScanner{
		reader: buffered
		line: 1
		col: 1
	}
}
94
// free releases the reader scanner's internal buffer.
// It forwards to the `free` method of the wrapped buffered reader.
pub fn (mut s ReaderScanner) free() {
	s.reader.free()
}
99
// Token is a single lexical unit produced by the JSON scanners.
pub struct Token {
pub:
	lit []u8 // literal representation of the token
	kind TokenKind // the token number/enum; for quick comparisons
	line int // the line in the source where the token occurred
	col int // the column in the source where the token occurred
}
107
// literal returns the token contents as a string built from the `lit` bytes.
pub fn (t Token) literal() string {
	text := t.lit.bytestr()
	return text
}
112
// full_col returns the token column plus the length of its literal.
pub fn (t Token) full_col() int {
	literal_len := t.lit.len
	return t.col + literal_len
}
117
// is_eof reports whether the token marks the end of the JSON stream.
pub fn (t Token) is_eof() bool {
	at_end := t.kind == .eof
	return at_end
}
122
// structural characters of JSON; the scanners emit each of them as its own token.
const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]!
// bytes treated as newlines when moving to a new position.
// NOTE(review): `\t` is part of this list, so the position tracking counts a tab
// like a line break — confirm that this is intended.
const newlines = [`\r`, `\n`, `\t`]!
// list of control characters that need to be escaped inside a JSON string.
// double quotes and forward slashes are excluded intentionally since
// they have their own separate checks in order to pass the
// JSON test suite (https://github.com/nst/JSONTestSuite/).
const important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]!
// characters that may follow a backslash, aside from `u` + 4 hex digits.
const valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]!
// maps the character after a backslash to the byte it stands for (eg. n => \n).
// The keys are the ASCII codes of `b`, `f`, `n`, `r`, `t`, `\`, `"` and `/`.
const unicode_transform_escapes = {
	98: `\b`
	102: `\f`
	110: `\n`
	114: `\r`
	116: `\t`
	92: `\\`
	34: `"`
	47: `/`
}
// signs that are accepted directly after the `e`/`E` of an exponent.
const exp_signs = [u8(`-`), `+`]!
146
// new_scan_error builds a JsonScanError from a message and a line/column position.
fn new_scan_error(message string, line int, col int) JsonScanError {
	scan_err := JsonScanError{
		line: line
		character: col
		message: message
	}
	return scan_err
}
154
// token_to_scan_error converts a token into a JsonScanError, using the token
// literal as message and the token position as error position.
fn token_to_scan_error(token Token) JsonScanError {
	description := token.literal()
	return new_scan_error(description, token.line, token.col)
}
158
// important_escapable_char returns the escape letter for a control character
// that has a short backslash escape (backspace, form feed, newline, carriage
// return, tab), or `none` for every other byte.
fn important_escapable_char(ch u8) ?u8 {
	if ch == `\b` {
		return u8(`b`)
	}
	if ch == `\f` {
		return u8(`f`)
	}
	if ch == `\n` {
		return u8(`n`)
	}
	if ch == `\r` {
		return u8(`r`)
	}
	if ch == `\t` {
		return u8(`t`)
	}
	return none
}
169
// invalid_token_description builds the "invalid token" message for a byte.
// Bytes in the range 32..126 are shown as they are, all others in escaped form.
fn invalid_token_description(ch u8) string {
	is_printable := ch >= 32 && ch <= 126
	shown := if is_printable { ch.ascii_str() } else { ch.str_escaped() }
	return 'invalid token `${shown}`'
}
179
// move proceeds to the next position, with both space handling and
// newline handling of move_pos enabled.
fn (mut s Scanner) move() {
	skip_spaces := true
	track_newlines := true
	s.move_pos(skip_spaces, track_newlines)
}
184
// move_pos_with_newlines is the same as move_pos but only enables newline checking;
// space handling stays disabled.
fn (mut s Scanner) move_pos_with_newlines() {
	skip_spaces := false
	track_newlines := true
	s.move_pos(skip_spaces, track_newlines)
}
189
// move_pos advances the scanner by one byte and then inspects the byte it landed on.
// - include_newlines: when the new byte is in `newlines`, the line counter is
//   incremented, the column is reset and the following newline bytes are skipped.
// - include_space: when the new byte is a space, it and the following spaces are skipped.
// When the new position is past the end of the text, only the column is incremented.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos < s.text.len {
		if include_newlines && s.text[s.pos] in newlines {
			s.line++
			s.col = 0
			// treat `\r\n` as a single line break by stepping onto the `\n`
			if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
				s.pos++
			}
			// step off the newline; `move` recurses into move_pos, so every further
			// newline byte that is landed on bumps the line counter again
			for s.pos < s.text.len && s.text[s.pos] in newlines {
				s.move()
			}
		} else if include_space && s.text[s.pos] == ` ` {
			s.pos++
			s.col++
			// keep going while the scanner still sits on a space
			for s.pos < s.text.len && s.text[s.pos] == ` ` {
				s.move()
			}
		}
		// NOTE(review): landing on any other byte leaves `col` unchanged — confirm
		// whether the column is meant to advance here as well.
	} else {
		s.col++
	}
}
213
// error returns an `.error` token whose literal is the given description,
// positioned at the scanner's current line and column.
fn (s &Scanner) error(description string) Token {
	message := description.bytes()
	return s.tokenize(message, .error)
}
218
// tokenize returns a token with the given lit and kind, stamped with the
// scanner's current line and column.
fn (s &Scanner) tokenize(lit []u8, kind TokenKind) Token {
	line, col := s.line, s.col
	return Token{
		kind: kind
		lit: lit
		line: line
		col: col
	}
}
228
// text_scan scans and returns a string token.
// It is entered with s.pos on the opening `"` (the loop advances before it reads).
// The result is a `.str` token holding the unescaped bytes of the string, or an
// `.error` token describing the first problem that was found.
@[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []u8{}
	for {
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			// ran out of input before the closing quote; reported after the loop
			break
		}
		ch := s.text[s.pos]
		if ch == `"` {
			has_closed = true
			break
		} else if escaped := important_escapable_char(ch) {
			// raw control character that has a short escape form
			return s.error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}')
		} else if ch < 0x20 {
			// any other raw control character
			return s.error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}')
		} else if ch == `\\` {
			if s.pos == s.text.len - 1 {
				return s.error('incomplete backslash escape at end of JSON input')
			}

			peek := s.text[s.pos + 1]
			if peek in valid_unicode_escapes {
				// single-letter escape: append the byte it stands for and skip the letter
				chrs << unicode_transform_escapes[int(peek)]
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				// `\u` needs four more bytes after the `u`
				if s.pos + 5 < s.text.len {
					s.pos++
					s.col++
					mut codepoint := []u8{}
					codepoint_start := s.pos
					// collect up to four hex digits following the `u`
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							// string ends early; caught by the length check below
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`${x}` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					// convert the hex digits to a code point and append its string bytes
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					// the function is marked manualfree, so the temporaries are released by hand
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == u8(229) {
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	// build the token before moving, so it carries the position reached by the loop
	tok := s.tokenize(chrs, .str)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}
309
// num_scan scans and returns an int/float token.
// Number structure: -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
// It is entered with s.pos on a digit or on `-`. The result is a `.float` token
// when a `.` was consumed and an `.int` token otherwise. An `.error` token is
// returned for a `-` without a digit after it, for a leading zero, for a `.`
// without digits after it and for an exponent without digits.
fn (mut s Scanner) num_scan() Token {
	mut is_fl := false
	mut dot_index := -1 // index of the `.` inside digits; -1 while no dot was seen
	mut digits := []u8{}
	if s.text[s.pos] == `-` {
		digits << `-`
		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		// the next byte is known to be a digit, so this is a plain one-byte step
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	// contiguous turns false as soon as advancing skipped over newline bytes.
	// The number ends there: `1\n2` must not be read as the single number `12`.
	mut contiguous := true
	for contiguous && s.pos < s.text.len
		&& (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		contiguous = s.num_advance()
	}
	// a dot has to be followed by at least one digit; compare against the
	// collected digits, not against the length of the whole input
	if dot_index != -1 && dot_index + 1 == digits.len {
		return s.error('invalid float')
	}
	if contiguous && s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		contiguous = s.num_advance()
		if contiguous && s.pos < s.text.len && s.text[s.pos] in exp_signs {
			digits << s.text[s.pos]
			contiguous = s.num_advance()
		}
		mut exp_digits_count := 0
		for contiguous && s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			contiguous = s.num_advance()
		}
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int }
	return s.tokenize(digits, kind)
}

// num_advance steps to the next byte of a number with newline tracking enabled.
// It returns false when the step skipped over newline bytes, which means the
// byte the scanner now sits on is no longer adjacent to the previous one.
fn (mut s Scanner) num_advance() bool {
	adjacent := s.pos + 1
	s.move_pos_with_newlines()
	return s.pos == adjacent
}
358
// invalid_token returns an error token that names the byte at the current position.
fn (s &Scanner) invalid_token() Token {
	offending := s.text[s.pos]
	return s.error(invalid_token_description(offending))
}
363
// next returns the next JSON token from the in-memory scanner.
// An `.error` token is turned into a JsonScanError instead of being returned.
pub fn (mut s Scanner) next() !Token {
	scanned := s.scan()
	if scanned.kind != .error {
		return scanned
	}
	return token_to_scan_error(scanned)
}
372
// scan returns a token based on the scanner's current position.
// used to set the next token
// Problems are reported as `.error` tokens; the end of the text as an `.eof` token.
@[manualfree]
fn (mut s Scanner) scan() Token {
	// step over whitespace in front of the token
	if s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]u8{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		// candidate for `true` or `null`: compare the next four bytes
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool
			}
			// the function is marked manualfree, so the temporary string is released by hand
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 4]
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		// candidate for `false`: compare the next five bytes
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in char_list {
		// punctuation: the byte value doubles as the TokenKind value
		chr := s.text[s.pos]
		tok := s.tokenize([]u8{}, unsafe { TokenKind(int(chr)) })
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		// also reached by a `t`, `n` or `f` that is too close to the end of the text
		return s.invalid_token()
	}
}
428
// tokenize returns a token with the given lit and kind at the given position.
// The position is passed in explicitly and is not taken from the scanner.
fn (mut s ReaderScanner) tokenize(lit []u8, kind TokenKind, line int, col int) Token {
	tok := Token{
		kind: kind
		lit: lit
		col: col
		line: line
	}
	return tok
}
437
// has_next_byte makes sure a byte is available in `s.ch` without consuming it.
// It returns false when the reader reports io.Eof or delivers zero bytes;
// every other reader error is passed on to the caller.
fn (mut s ReaderScanner) has_next_byte() !bool {
	if !s.peeked {
		mut one := [u8(0)]
		count := s.reader.read(mut one) or {
			if err is io.Eof {
				return false
			}
			return err
		}
		if count == 0 {
			return false
		}
		s.ch = one[0]
		s.peeked = true
	}
	return true
}
456
// peek_byte returns the next byte without consuming it.
// It fails with io.Eof when no further byte is available.
fn (mut s ReaderScanner) peek_byte() !u8 {
	available := s.has_next_byte()!
	if available {
		return s.ch
	}
	return io.Eof{}
}
463
// advance_position updates line and column after the byte `ch` was consumed.
// A `\n` directly after a `\r` is dropped, so the pair counts as one line break.
// Bytes listed in `newlines` start a new line at column 1, all others move one column on.
fn (mut s ReaderScanner) advance_position(ch u8) ! {
	if ch == `\r` {
		followed_by_lf := s.has_next_byte()! && s.ch == `\n`
		if followed_by_lf {
			s.peeked = false
		}
	}
	if ch !in newlines {
		s.col++
		return
	}
	s.line++
	s.col = 1
}
477
// read_byte consumes and returns the next byte and updates the position.
// The peek flag is cleared before the position update, because that update
// may look ahead at the byte that follows.
fn (mut s ReaderScanner) read_byte() !u8 {
	consumed := s.peek_byte()!
	s.peeked = false
	s.advance_position(consumed)!
	return consumed
}
484
// skip_whitespace consumes spaces and the bytes listed in `newlines` until
// another byte or the end of the input is reached.
fn (mut s ReaderScanner) skip_whitespace() ! {
	for {
		has_more := s.has_next_byte()!
		if !has_more {
			break
		}
		is_blank := s.ch == ` ` || s.ch in newlines
		if !is_blank {
			break
		}
		_ = s.read_byte()!
	}
}
498
// scan_ident consumes the bytes of `ident` from the reader and returns a token
// of the given kind at the given position. A byte that differs from the expected
// one, or the end of the input, yields a JsonScanError at the position of that byte.
fn (mut s ReaderScanner) scan_ident(ident string, kind TokenKind, line int, col int) !Token {
	mut matched := []u8{cap: ident.len}
	for want in ident {
		at_line, at_col := s.line, s.col
		got := s.read_byte() or {
			if err is io.Eof {
				return new_scan_error('unexpected end of JSON input', at_line, at_col)
			}
			return err
		}
		if got != want {
			return new_scan_error(invalid_token_description(got), at_line, at_col)
		}
		matched << got
	}
	return s.tokenize(matched, kind, line, col)
}
516
// text_scan reads a string token from the reader.
// `line` and `col` are the position of the opening quote; they are used for the
// returned token and for the "missing closing quote" error. The other errors
// carry the position of the offending byte or of the escape they belong to.
@[manualfree]
fn (mut s ReaderScanner) text_scan(line int, col int) !Token {
	mut chrs := []u8{}
	_ = s.read_byte()! // opening quote
	for {
		current_line, current_col := s.line, s.col
		if !s.has_next_byte()! {
			return new_scan_error('missing double quotes in string closing', line, col)
		}
		ch := s.ch
		if ch == `"` {
			// closing quote: consume it and finish
			_ = s.read_byte()!
			break
		} else if escaped := important_escapable_char(ch) {
			// raw control character that has a short escape form
			return new_scan_error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}',
				current_line, current_col)
		} else if ch < 0x20 {
			// any other raw control character
			return new_scan_error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}',
				current_line, current_col)
		} else if ch == `\\` {
			_ = s.read_byte()!
			// position of the byte that follows the backslash
			escape_line, escape_col := s.line, s.col
			if !s.has_next_byte()! {
				return new_scan_error('incomplete backslash escape at end of JSON input',
					escape_line, escape_col)
			}
			peek := s.ch
			if peek in valid_unicode_escapes {
				// single-letter escape: append the byte it stands for and consume the letter
				chrs << unicode_transform_escapes[int(peek)]
				_ = s.read_byte()!
				continue
			} else if peek == `u` {
				_ = s.read_byte()!
				mut codepoint := []u8{}
				// exactly four hex digits have to follow the `u`
				for _ in 0 .. 4 {
					digit_line, digit_col := s.line, s.col
					if !s.has_next_byte()! {
						return new_scan_error('incomplete unicode escape', escape_line, escape_col)
					}
					digit := s.ch
					if digit == `"` {
						return new_scan_error('unicode escape must have 4 hex digits', digit_line,
							digit_col)
					} else if !digit.is_hex_digit() {
						x := digit.ascii_str()
						return new_scan_error('`${x}` is not a hex digit', digit_line, digit_col)
					}
					codepoint << digit
					_ = s.read_byte()!
				}
				// convert the hex digits to a code point and append its string bytes
				val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
				converted := utf32_to_str(val)
				converted_bytes := converted.bytes()
				chrs << converted_bytes
				// the function is marked manualfree, so the temporaries are released by hand
				unsafe {
					converted.free()
					converted_bytes.free()
					codepoint.free()
				}
				continue
			} else if peek == `U` {
				return new_scan_error('unicode endpoints must be in lowercase `u`', escape_line,
					escape_col)
			} else if peek == u8(229) {
				return new_scan_error('unicode endpoint not allowed', escape_line, escape_col)
			} else {
				return new_scan_error('invalid backslash escape', escape_line, escape_col)
			}
		}
		// ordinary byte: keep it and consume it
		chrs << ch
		_ = s.read_byte()!
	}
	return s.tokenize(chrs, .str, line, col)
}
591
// num_scan reads an int/float token from the reader.
// `line` and `col` are the position of the first byte of the number; they are
// used for the returned token and for most errors. The result is a `.float`
// token when a `.` was consumed and an `.int` token otherwise.
fn (mut s ReaderScanner) num_scan(line int, col int) !Token {
	mut has_fraction := false
	mut dot_at := -1
	mut buf := []u8{}
	// optional sign, which has to be followed by a digit
	if s.peek_byte()! == `-` {
		buf << `-`
		_ = s.read_byte()!
		if !s.has_next_byte()! {
			return new_scan_error('invalid token `-`', line, col)
		}
		after_sign := s.ch
		if !after_sign.is_digit() {
			return new_scan_error(invalid_token_description(after_sign), s.line, s.col)
		}
	}
	// a leading zero must not be followed by another digit
	if s.has_next_byte()! && s.ch == `0` {
		zero := s.ch
		buf << zero
		_ = s.read_byte()!
		if s.has_next_byte()! && s.ch.is_digit() {
			return new_scan_error('leading zeroes in a number are not allowed', line, col)
		}
	}
	// digits, with at most one dot
	for {
		if !s.has_next_byte()! {
			break
		}
		current := s.ch
		is_dot := current == `.`
		if !current.is_digit() && (has_fraction || !is_dot) {
			break
		}
		buf << current
		if is_dot {
			has_fraction = true
			dot_at = buf.len - 1
		}
		_ = s.read_byte()!
	}
	// a dot as the last collected byte means that no digit followed it
	if dot_at != -1 && dot_at == buf.len - 1 {
		return new_scan_error('invalid float', line, col)
	}
	// optional exponent: e/E, optional sign, at least one digit
	if s.has_next_byte()! && (s.ch == `e` || s.ch == `E`) {
		marker := s.ch
		buf << marker
		_ = s.read_byte()!
		if s.has_next_byte()! && s.ch in exp_signs {
			buf << s.ch
			_ = s.read_byte()!
		}
		mut exp_len := 0
		for {
			if !s.has_next_byte()! {
				break
			}
			exp_digit := s.ch
			if !exp_digit.is_digit() {
				break
			}
			buf << exp_digit
			exp_len++
			_ = s.read_byte()!
		}
		if exp_len == 0 {
			return new_scan_error('invalid exponent', line, col)
		}
	}
	kind := if has_fraction { TokenKind.float } else { TokenKind.int }
	return s.tokenize(buf, kind, line, col)
}
666
// next returns the next JSON token from the reader-backed scanner.
// Whitespace in front of the token is skipped. The end of the input yields an
// `.eof` token; a byte that cannot start a token yields a JsonScanError.
pub fn (mut s ReaderScanner) next() !Token {
	s.skip_whitespace()!
	line, col := s.line, s.col
	if !s.has_next_byte()! {
		return s.tokenize([]u8{}, .eof, line, col)
	}
	ch := s.ch
	match ch {
		`t` {
			return s.scan_ident('true', .bool, line, col)
		}
		`n` {
			return s.scan_ident('null', .null, line, col)
		}
		`f` {
			return s.scan_ident('false', .bool, line, col)
		}
		`"` {
			return s.text_scan(line, col)
		}
		else {}
	}
	if ch in char_list {
		// punctuation: the byte value doubles as the TokenKind value
		_ = s.read_byte()!
		return s.tokenize([]u8{}, unsafe { TokenKind(int(ch)) }, line, col)
	}
	if ch.is_digit() || ch == `-` {
		return s.num_scan(line, col)
	}
	return new_scan_error(invalid_token_description(ch), line, col)
}
691