Gitly


1 // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module json2
5 
6 import io
7 import strconv
8 
// JsonScanError describes a tokenization error reported by the iterative scanner APIs.
// It embeds `Error` and stores the position that `msg()` prints in front of the message.
pub struct JsonScanError {
    Error
pub:
    message string // description of the problem, without the position prefix

    line      int // line recorded by the scanner for the offending token
    character int // column recorded by the scanner for the offending token
}
18 
// msg renders the error as `<line>:<character>: Invalid json token: <message>`.
fn (e JsonScanError) msg() string {
    rendered := '${e.line}:${e.character}: Invalid json token: ${e.message}'
    return rendered
}
22 
// Scanner tokenizes JSON from an in-memory string or byte slice.
// `line` and `col` are the values copied into every token it produces.
pub struct Scanner {
mut:
    text []u8 // the complete JSON input
    pos  int // the position of the token in scanner text
    line int = 1 // current line, starts at 1
    col  int = 1 // current column, starts at 1
}
31 
// ReaderScanner tokenizes JSON incrementally from any io.Reader.
// It keeps at most one byte of lookahead (`ch`, valid while `peeked` is true).
pub struct ReaderScanner {
mut:
    reader &io.BufferedReader // buffered source the bytes are pulled from
    peeked bool // true while `ch` holds a byte that has not been consumed yet
    ch     u8 // the lookahead byte
    line   int = 1 // line of the next unread byte, starts at 1
    col    int = 1 // column of the next unread byte, starts at 1
}
41 
// ReaderScannerConfig configures a reader-backed JSON scanner.
// It is a `@[params]` struct, so callers can pass its fields as named arguments.
@[params]
pub struct ReaderScannerConfig {
pub:
    reader      io.Reader // source of the JSON bytes
    buffer_size int = 128 * 1024 // capacity handed to the buffered reader (128 KiB by default)
}
49 
// TokenKind identifies the kind of a JSON token.
// The punctuation kinds use the ASCII code of their character as the enum
// value, which lets the scanners cast a punctuation byte straight to its kind.
pub enum TokenKind {
    none
    error
    str
    float
    int
    null
    bool
    eof
    comma = 44  // ,
    colon = 58  // :
    lsbr  = 91  // [
    rsbr  = 93  // ]
    lcbr  = 123 // {
    rcbr  = 125 // }
}
67 
// new_scanner creates an iterative scanner for an in-memory JSON string.
// The string is converted to bytes once; scanning starts at line 1, column 1.
pub fn new_scanner(text string) Scanner {
    source := text.bytes()
    return Scanner{
        col:  1
        line: 1
        text: source
    }
}
76 
// new_scanner_from_bytes creates an iterative scanner for an in-memory JSON byte slice.
// The slice is stored as given; scanning starts at line 1, column 1.
pub fn new_scanner_from_bytes(text []u8) Scanner {
    mut scanner := Scanner{
        text: text
    }
    scanner.line = 1
    scanner.col = 1
    return scanner
}
85 
// new_reader_scanner creates an iterative scanner that reads JSON tokens from an io.Reader.
// The reader is wrapped in a buffered reader whose capacity is `config.buffer_size`.
pub fn new_reader_scanner(config ReaderScannerConfig) &ReaderScanner {
    buffered := io.new_buffered_reader(reader: config.reader, cap: config.buffer_size)
    return &ReaderScanner{
        col:    1
        line:   1
        reader: buffered
    }
}
94 
// free releases the reader scanner's internal buffer by freeing the
// buffered reader it wraps.
pub fn (mut s ReaderScanner) free() {
    s.reader.free()
}
99 
// Token is a single lexical unit produced by the JSON scanners.
pub struct Token {
pub:
    lit  []u8      // literal representation of the token
    kind TokenKind // the token number/enum; for quick comparisons
    line int       // the line in the source where the token occurred
    col  int       // the column in the source where the token occurred
}
107 
// literal returns the token contents as a string built from the `lit` bytes.
pub fn (t Token) literal() string {
    text := t.lit.bytestr()
    return text
}
112 
// full_col returns the token column plus the length of its literal.
pub fn (t Token) full_col() int {
    width := t.lit.len
    return t.col + width
}
117 
// is_eof reports whether the token marks the end of the JSON stream.
pub fn (t Token) is_eof() bool {
    return match t.kind {
        .eof { true }
        else { false }
    }
}
122 
// structural characters of JSON; the scanners emit each of them as a token of its own.
const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]!
// bytes handled by the newline logic when moving to a new position.
// NOTE: besides `\r` and `\n` this list also contains the horizontal tab,
// so every `in newlines` test treats a tab the same way as a line break.
const newlines = [`\r`, `\n`, `\t`]!
// list of control characters that need to be escaped inside a JSON string.
// double quotes and forward slashes are excluded intentionally since
// they have their own separate checks for it in order to pass the
// JSON test suite (https://github.com/nst/JSONTestSuite/).
const important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]!
// characters that may follow a backslash to form a simple (two character)
// escape, i.e. every valid escape aside from \u{4-hex digits}
const valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]!
// maps the ASCII code of an escape letter to the byte it stands for (eg. n => \n)
const unicode_transform_escapes = {
    98:  `\b`
    102: `\f`
    110: `\n`
    114: `\r`
    116: `\t`
    92:  `\\`
    34:  `"`
    47:  `/`
}
// signs that are accepted directly after the `e`/`E` of an exponent
const exp_signs = [u8(`-`), `+`]!
146 
// new_scan_error builds a JsonScanError for `message` at the given position.
// `col` is stored in the `character` field of the error.
fn new_scan_error(message string, line int, col int) JsonScanError {
    return JsonScanError{
        character: col
        line:      line
        message:   message
    }
}
154 
// token_to_scan_error converts an error token into a JsonScanError, using the
// token literal as the message and the token position as the error position.
fn token_to_scan_error(token Token) JsonScanError {
    description := token.literal()
    return new_scan_error(description, token.line, token.col)
}
158 
// important_escapable_char returns the escape letter (`b`, `f`, `n`, `r`, `t`)
// for a raw control byte that needs a backslash escape inside a JSON string,
// or `none` for every other byte.
fn important_escapable_char(ch u8) ?u8 {
    if ch == `\b` {
        return u8(`b`)
    }
    if ch == `\f` {
        return u8(`f`)
    }
    if ch == `\n` {
        return u8(`n`)
    }
    if ch == `\r` {
        return u8(`r`)
    }
    if ch == `\t` {
        return u8(`t`)
    }
    return none
}
169 
// invalid_token_description formats the "invalid token" message for `ch`.
// Bytes in the range 32..126 are shown as-is, all others in their escaped form.
fn invalid_token_description(ch u8) string {
    printable := ch >= 32 && ch <= 126
    x := if printable { ch.ascii_str() } else { ch.str_escaped() }
    return 'invalid token `${x}`'
}
179 
// move proceeds to the next position; it calls `move_pos` with both the
// space and the newline handling enabled.
fn (mut s Scanner) move() {
    s.move_pos(true, true)
}
184 
// move_pos_with_newlines is the same as move_pos but only enables newline checking
// (space handling is switched off).
fn (mut s Scanner) move_pos_with_newlines() {
    s.move_pos(false, true)
}
189 
// move_pos advances the scanner by one byte and then, depending on the flags,
// keeps advancing over whitespace:
// - `include_newlines`: when the new byte is in `newlines`, the line counter is
//   incremented, the column is reset to 0 and all following `newlines` bytes
//   are skipped through `move`.
// - `include_space`: when the new byte is a space, it and all following spaces
//   are skipped through `move`.
// The column is not changed when the new byte is an ordinary character; it is
// only incremented for spaces and when the position runs past the end of the text.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
    s.pos++
    if s.pos < s.text.len {
        if include_newlines && s.text[s.pos] in newlines {
            s.line++
            s.col = 0
            // step over the `\r` of a `\r\n` pair so the pair is handled as one unit
            if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
                s.pos++
            }
            // `move` re-enters move_pos with both flags enabled
            for s.pos < s.text.len && s.text[s.pos] in newlines {
                s.move()
            }
        } else if include_space && s.text[s.pos] == ` ` {
            s.pos++
            s.col++
            for s.pos < s.text.len && s.text[s.pos] == ` ` {
                s.move()
            }
        }
    } else {
        // moved past the last byte of the text
        s.col++
    }
}
213 
// error returns an error token whose literal is `description`, positioned at
// the scanner's current line and column.
fn (s &Scanner) error(description string) Token {
    return Token{
        lit:  description.bytes()
        kind: .error
        col:  s.col
        line: s.line
    }
}
218 
// tokenize returns a token with the given lit and kind, stamped with the
// scanner's current line and column.
fn (s &Scanner) tokenize(lit []u8, kind TokenKind) Token {
    tok := Token{
        line: s.line
        col:  s.col
        kind: kind
        lit:  lit
    }
    return tok
}
228 
// text_scan scans and returns a string token.
// It must be called with `s.pos` on the opening double quote. The returned
// token holds the decoded bytes of the string (escapes resolved, quotes
// removed). An error token is returned for raw control characters, malformed
// escapes and a missing closing quote.
//
// A `\uD800`-`\uDBFF` escape that is directly followed by a `\uDC00`-`\uDFFF`
// escape is decoded as one code point (UTF-16 surrogate pair, RFC 8259
// section 7) instead of two separate ones. Unpaired surrogates are still
// converted on their own, as before.
@[manualfree]
fn (mut s Scanner) text_scan() Token {
    mut has_closed := false
    mut chrs := []u8{}
    for {
        s.pos++
        s.col++
        if s.pos >= s.text.len {
            break
        }
        ch := s.text[s.pos]
        if ch == `"` {
            has_closed = true
            break
        } else if escaped := important_escapable_char(ch) {
            return s.error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}')
        } else if ch < 0x20 {
            return s.error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}')
        } else if ch == `\\` {
            if s.pos == s.text.len - 1 {
                return s.error('incomplete backslash escape at end of JSON input')
            }

            peek := s.text[s.pos + 1]
            if peek in valid_unicode_escapes {
                chrs << unicode_transform_escapes[int(peek)]
                s.pos++
                s.col++
                continue
            } else if peek == `u` {
                // the backslash must be followed by `u` and four more bytes
                if s.pos + 5 < s.text.len {
                    s.pos++
                    s.col++
                    mut codepoint := []u8{}
                    codepoint_start := s.pos
                    for s.pos < s.text.len && s.pos < codepoint_start + 4 {
                        s.pos++
                        s.col++
                        if s.text[s.pos] == `"` {
                            break
                        } else if !s.text[s.pos].is_hex_digit() {
                            x := s.text[s.pos].ascii_str()
                            return s.error('`${x}` is not a hex digit')
                        }
                        codepoint << s.text[s.pos]
                    }
                    if codepoint.len != 4 {
                        return s.error('unicode escape must have 4 hex digits')
                    }
                    mut val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
                    // s.pos is on the last hex digit here; look ahead for the
                    // low half of a surrogate pair (`\uXXXX`, six more bytes)
                    if val >= 0xD800 && val <= 0xDBFF && s.pos + 6 < s.text.len
                        && s.text[s.pos + 1] == `\\` && s.text[s.pos + 2] == `u` {
                        low_hex := s.text[s.pos + 3..s.pos + 7]
                        if low_hex.all(it.is_hex_digit()) {
                            low_str := low_hex.bytestr()
                            low := u32(strconv.parse_uint(low_str, 16, 32) or { 0 })
                            unsafe { low_str.free() }
                            if low >= 0xDC00 && low <= 0xDFFF {
                                val = u32(0x10000) + ((val - 0xD800) << 10) + (low - 0xDC00)
                                // consume the second escape as well
                                s.pos += 6
                                s.col += 6
                            }
                        }
                    }
                    converted := utf32_to_str(val)
                    converted_bytes := converted.bytes()
                    chrs << converted_bytes
                    unsafe {
                        converted.free()
                        converted_bytes.free()
                        codepoint.free()
                    }
                    continue
                } else {
                    return s.error('incomplete unicode escape')
                }
            } else if peek == `U` {
                return s.error('unicode endpoints must be in lowercase `u`')
            } else if peek == u8(229) {
                return s.error('unicode endpoint not allowed')
            } else {
                return s.error('invalid backslash escape')
            }
        }
        chrs << ch
    }
    // the token is created before `move` so it keeps the position of the closing quote
    tok := s.tokenize(chrs, .str)
    s.move()
    if !has_closed {
        return s.error('missing double quotes in string closing')
    }
    return tok
}
309 
// num_scan scans and returns an int/float token.
// It must be called with `s.pos` on the first byte of the number (a digit or
// `-`). The token literal holds the bytes of the number; the kind is `.float`
// when the number contains a fraction dot and `.int` otherwise. An error token
// is returned for a lone `-`, leading zeroes, a dot without fraction digits
// and an exponent without digits.
fn (mut s Scanner) num_scan() Token {
    // analyze json number structure
    // -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
    mut is_fl := false
    mut dot_index := -1
    mut digits := []u8{}
    // becomes true once whitespace was skipped behind the number; bytes after
    // that whitespace belong to the next token and must not be appended
    mut ended := false
    if s.text[s.pos] == `-` {
        digits << `-`
        if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
            return s.invalid_token()
        }
        s.move_pos_with_newlines()
    }
    if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
        return s.error('leading zeroes in a number are not allowed')
    }
    for !ended && s.pos < s.text.len
        && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
        digits << s.text[s.pos]
        if s.text[s.pos] == `.` {
            is_fl = true
            dot_index = digits.len - 1
        }
        ended = s.num_advance()
    }
    // a dot must be followed by at least one digit
    if dot_index != -1 && dot_index + 1 == digits.len {
        return s.error('invalid float')
    }
    if !ended && s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
        digits << s.text[s.pos]
        ended = s.num_advance()
        if !ended && s.pos < s.text.len && s.text[s.pos] in exp_signs {
            digits << s.text[s.pos]
            ended = s.num_advance()
        }
        mut exp_digits_count := 0
        for !ended && s.pos < s.text.len && s.text[s.pos].is_digit() {
            digits << s.text[s.pos]
            exp_digits_count++
            ended = s.num_advance()
        }
        if exp_digits_count == 0 {
            return s.error('invalid exponent')
        }
    }
    kind := if is_fl { TokenKind.float } else { TokenKind.int }
    return s.tokenize(digits, kind)
}

// num_advance moves past the current byte of a number and reports whether
// whitespace was skipped while doing so, i.e. whether the number has ended.
fn (mut s Scanner) num_advance() bool {
    before := s.pos
    s.move_pos_with_newlines()
    return s.pos != before + 1
}
358 
// invalid_token returns an error token describing the byte at the current position.
fn (s &Scanner) invalid_token() Token {
    offending := s.text[s.pos]
    description := invalid_token_description(offending)
    return s.error(description)
}
363 
// next returns the next JSON token from the in-memory scanner.
// An error token is converted into a JsonScanError and returned as an error.
pub fn (mut s Scanner) next() !Token {
    tok := s.scan()
    if tok.kind != .error {
        return tok
    }
    return token_to_scan_error(tok)
}
372 
// scan returns a token based on the scanner's current position.
// used to set the next token
//
// Whitespace in front of the token is skipped first. Problems are reported as
// a token of kind `.error`; at the end of the text a `.eof` token is returned.
// The literal of a `true`/`false`/`null` token is a slice of the scanner text.
@[manualfree]
fn (mut s Scanner) scan() Token {
    // `move` can stop on a whitespace byte (for example on the newline of
    // " \n"), so keep moving until a non-whitespace byte or the end is reached
    for s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in newlines) {
        s.move()
    }
    if s.pos >= s.text.len {
        return s.tokenize([]u8{}, .eof)
    } else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
        ident := s.text[s.pos..s.pos + 4].bytestr()
        if ident == 'true' || ident == 'null' {
            mut kind := TokenKind.null
            if ident == 'true' {
                kind = .bool
            }
            unsafe { ident.free() }
            val := s.text[s.pos..s.pos + 4]
            tok := s.tokenize(val, kind)
            s.move() // n / t
            s.move() // u / r
            s.move() // l / u
            s.move() // l / e
            return tok
        }
        unsafe { ident.free() }
        return s.invalid_token()
    } else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
        ident := s.text[s.pos..s.pos + 5].bytestr()
        if ident == 'false' {
            unsafe { ident.free() }
            val := s.text[s.pos..s.pos + 5]
            tok := s.tokenize(val, .bool)
            s.move() // f
            s.move() // a
            s.move() // l
            s.move() // s
            s.move() // e
            return tok
        }
        unsafe { ident.free() }
        return s.invalid_token()
    } else if s.text[s.pos] in char_list {
        chr := s.text[s.pos]
        // punctuation kinds use the ASCII code of their character as value
        tok := s.tokenize([]u8{}, unsafe { TokenKind(int(chr)) })
        s.move()
        return tok
    } else if s.text[s.pos] == `"` {
        return s.text_scan()
    } else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
        return s.num_scan()
    } else {
        return s.invalid_token()
    }
}
428 
// tokenize returns a token with the given literal and kind at the given position.
fn (mut s ReaderScanner) tokenize(lit []u8, kind TokenKind, line int, col int) Token {
    tok := Token{
        col:  col
        line: line
        kind: kind
        lit:  lit
    }
    return tok
}
437 
// has_next_byte reports whether another byte is available. When no byte is
// buffered yet, one byte is read into `s.ch` and `s.peeked` is set.
// `io.Eof` and a zero-length read yield `false`; other read errors are returned.
fn (mut s ReaderScanner) has_next_byte() !bool {
    if !s.peeked {
        mut one := [u8(0)]
        count := s.reader.read(mut one) or {
            if err is io.Eof {
                return false
            }
            return err
        }
        if count == 0 {
            return false
        }
        s.ch = one[0]
        s.peeked = true
    }
    return true
}
456 
// peek_byte returns the next byte without consuming it, or `io.Eof` when the
// input is exhausted.
fn (mut s ReaderScanner) peek_byte() !u8 {
    if s.has_next_byte()! {
        return s.ch
    }
    return io.Eof{}
}
463 
// advance_position updates `line` and `col` after the byte `ch` was consumed.
// `\n` and `\r` start a new line (column 1); the `\n` of a `\r\n` pair is
// consumed together with its `\r`, so the pair counts as one line break.
// Every other byte, including a tab, advances the column by one.
fn (mut s ReaderScanner) advance_position(ch u8) ! {
    if ch == `\r` {
        if s.has_next_byte()! && s.ch == `\n` {
            // drop the buffered `\n`
            s.peeked = false
        }
    }
    // only real line terminators may bump the line counter; testing
    // `ch in newlines` would also count tabs as line breaks
    if ch == `\n` || ch == `\r` {
        s.line++
        s.col = 1
        return
    }
    s.col++
}
477 
// read_byte consumes and returns the next byte and updates the position
// counters for it. `io.Eof` is returned when the input is exhausted.
fn (mut s ReaderScanner) read_byte() !u8 {
    consumed := s.peek_byte()!
    s.peeked = false
    s.advance_position(consumed)!
    return consumed
}
484 
// skip_whitespace consumes spaces and the bytes listed in `newlines` until a
// different byte or the end of the input is reached.
fn (mut s ReaderScanner) skip_whitespace() ! {
    for {
        available := s.has_next_byte()!
        if !available {
            break
        }
        if s.ch != ` ` && s.ch !in newlines {
            break
        }
        _ = s.read_byte()!
    }
}
498 
// scan_ident consumes the bytes of `ident` from the reader and returns a token
// of the given kind positioned at `line`/`col`. A scan error is returned when
// the input ends early or a byte differs from the expected one; the error
// carries the position of the offending byte.
fn (mut s ReaderScanner) scan_ident(ident string, kind TokenKind, line int, col int) !Token {
    mut matched := []u8{}
    for i := 0; i < ident.len; i++ {
        at_line := s.line
        at_col := s.col
        got := s.read_byte() or {
            if err is io.Eof {
                return new_scan_error('unexpected end of JSON input', at_line, at_col)
            }
            return err
        }
        if got != ident[i] {
            return new_scan_error(invalid_token_description(got), at_line, at_col)
        }
        matched << got
    }
    return s.tokenize(matched, kind, line, col)
}
516 
// text_scan scans a string token from the reader. The next byte must be the
// opening double quote; `line`/`col` are the position stored in the token.
// The token literal holds the decoded bytes of the string. A scan error is
// returned for raw control characters, malformed escapes and a missing
// closing quote.
//
// A `\uD800`-`\uDBFF` escape that is directly followed by a `\uDC00`-`\uDFFF`
// escape is decoded as one code point (UTF-16 surrogate pair, RFC 8259
// section 7). Unpaired surrogates are converted on their own, as before.
@[manualfree]
fn (mut s ReaderScanner) text_scan(line int, col int) !Token {
    mut chrs := []u8{}
    // high surrogate that still waits for its low half; 0 when there is none
    mut pending_high := u32(0)
    _ = s.read_byte()! // opening quote
    for {
        current_line, current_col := s.line, s.col
        if !s.has_next_byte()! {
            return new_scan_error('missing double quotes in string closing', line, col)
        }
        ch := s.ch
        if ch == `"` {
            _ = s.read_byte()!
            break
        } else if escaped := important_escapable_char(ch) {
            return new_scan_error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}',
                current_line, current_col)
        } else if ch < 0x20 {
            return new_scan_error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}',
                current_line, current_col)
        } else if ch == `\\` {
            _ = s.read_byte()!
            escape_line, escape_col := s.line, s.col
            if !s.has_next_byte()! {
                return new_scan_error('incomplete backslash escape at end of JSON input',
                    escape_line, escape_col)
            }
            peek := s.ch
            if peek in valid_unicode_escapes {
                if pending_high != 0 {
                    // the surrogate stays unpaired, emit it before the escape
                    reader_scanner_append_utf32(mut chrs, pending_high)
                    pending_high = 0
                }
                chrs << unicode_transform_escapes[int(peek)]
                _ = s.read_byte()!
                continue
            } else if peek == `u` {
                _ = s.read_byte()!
                mut codepoint := []u8{}
                for _ in 0 .. 4 {
                    digit_line, digit_col := s.line, s.col
                    if !s.has_next_byte()! {
                        return new_scan_error('incomplete unicode escape', escape_line, escape_col)
                    }
                    digit := s.ch
                    if digit == `"` {
                        return new_scan_error('unicode escape must have 4 hex digits', digit_line,
                            digit_col)
                    } else if !digit.is_hex_digit() {
                        x := digit.ascii_str()
                        return new_scan_error('`${x}` is not a hex digit', digit_line, digit_col)
                    }
                    codepoint << digit
                    _ = s.read_byte()!
                }
                mut val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
                unsafe { codepoint.free() }
                if pending_high != 0 {
                    if val >= 0xDC00 && val <= 0xDFFF {
                        // both halves are present: join them into one code point
                        val = u32(0x10000) + ((pending_high - 0xD800) << 10) + (val - 0xDC00)
                    } else {
                        reader_scanner_append_utf32(mut chrs, pending_high)
                    }
                    pending_high = 0
                }
                if val >= 0xD800 && val <= 0xDBFF {
                    // wait for the next escape before emitting anything
                    pending_high = val
                    continue
                }
                reader_scanner_append_utf32(mut chrs, val)
                continue
            } else if peek == `U` {
                return new_scan_error('unicode endpoints must be in lowercase `u`', escape_line,
                    escape_col)
            } else if peek == u8(229) {
                return new_scan_error('unicode endpoint not allowed', escape_line, escape_col)
            } else {
                return new_scan_error('invalid backslash escape', escape_line, escape_col)
            }
        }
        if pending_high != 0 {
            reader_scanner_append_utf32(mut chrs, pending_high)
            pending_high = 0
        }
        chrs << ch
        _ = s.read_byte()!
    }
    if pending_high != 0 {
        // the string ended directly after an unpaired high surrogate
        reader_scanner_append_utf32(mut chrs, pending_high)
    }
    return s.tokenize(chrs, .str, line, col)
}

// reader_scanner_append_utf32 appends the UTF-8 encoding of `code` to `out`
// and frees the temporary string and byte array used for the conversion.
@[manualfree]
fn reader_scanner_append_utf32(mut out []u8, code u32) {
    converted := utf32_to_str(code)
    converted_bytes := converted.bytes()
    out << converted_bytes
    unsafe {
        converted.free()
        converted_bytes.free()
    }
}
591 
// num_scan scans an int/float token from the reader. The next byte must be a
// digit or `-`; `line`/`col` are the position stored in the token.
// Accepted shape: -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
// A scan error is returned for a lone `-`, leading zeroes, a dot without
// fraction digits and an exponent without digits.
fn (mut s ReaderScanner) num_scan(line int, col int) !Token {
    mut buf := []u8{}
    // index of the fraction dot inside buf, -1 while none was seen
    mut dot_at := -1
    lead := s.peek_byte()!
    if lead == `-` {
        buf << `-`
        _ = s.read_byte()!
        if !s.has_next_byte()! {
            return new_scan_error('invalid token `-`', line, col)
        }
        if !s.ch.is_digit() {
            return new_scan_error(invalid_token_description(s.ch), s.line, s.col)
        }
    }
    // a leading zero must not be followed by another digit
    if s.has_next_byte()! && s.ch == `0` {
        buf << s.ch
        _ = s.read_byte()!
        if s.has_next_byte()! && s.ch.is_digit() {
            return new_scan_error('leading zeroes in a number are not allowed', line, col)
        }
    }
    // integer digits and at most one dot with its fraction digits
    for {
        if !s.has_next_byte()! {
            break
        }
        ch := s.ch
        first_dot := ch == `.` && dot_at == -1
        if !ch.is_digit() && !first_dot {
            break
        }
        buf << ch
        if first_dot {
            dot_at = buf.len - 1
        }
        _ = s.read_byte()!
    }
    if dot_at != -1 && dot_at == buf.len - 1 {
        return new_scan_error('invalid float', line, col)
    }
    // optional exponent
    if s.has_next_byte()! && (s.ch == `e` || s.ch == `E`) {
        buf << s.ch
        _ = s.read_byte()!
        if s.has_next_byte()! && s.ch in exp_signs {
            buf << s.ch
            _ = s.read_byte()!
        }
        mut exp_len := 0
        for {
            if !s.has_next_byte()! || !s.ch.is_digit() {
                break
            }
            buf << s.ch
            exp_len++
            _ = s.read_byte()!
        }
        if exp_len == 0 {
            return new_scan_error('invalid exponent', line, col)
        }
    }
    kind := if dot_at != -1 { TokenKind.float } else { TokenKind.int }
    return s.tokenize(buf, kind, line, col)
}
666 
// next returns the next JSON token from the reader-backed scanner.
// Whitespace in front of the token is skipped. At the end of the input a
// `.eof` token is returned; invalid input is reported as a JsonScanError.
pub fn (mut s ReaderScanner) next() !Token {
    s.skip_whitespace()!
    line := s.line
    col := s.col
    if !s.has_next_byte()! {
        return s.tokenize([]u8{}, .eof, line, col)
    }
    ch := s.ch
    // dispatch on the first byte of the token
    match ch {
        `t` {
            return s.scan_ident('true', .bool, line, col)
        }
        `n` {
            return s.scan_ident('null', .null, line, col)
        }
        `f` {
            return s.scan_ident('false', .bool, line, col)
        }
        `"` {
            return s.text_scan(line, col)
        }
        else {}
    }
    if ch in char_list {
        _ = s.read_byte()!
        return s.tokenize([]u8{}, unsafe { TokenKind(int(ch)) }, line, col)
    }
    if ch.is_digit() || ch == `-` {
        return s.num_scan(line, col)
    }
    return new_scan_error(invalid_token_description(ch), line, col)
}
691

1	// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2	// Use of this source code is governed by an MIT license
3	// that can be found in the LICENSE file.
4	module json2
5
6	import io
7	import strconv
8
9	// JsonScanError describes a tokenization error reported by the iterative scanner APIs.
10	pub struct JsonScanError {
11	Error
12	pub:
13	message string
14
15	line int
16	character int
17	}
18
19	fn (e JsonScanError) msg() string {
20	return '${e.line}:${e.character}: Invalid json token: ${e.message}'
21	}
22
23	// Scanner tokenizes JSON from an in-memory string or byte slice.
24	pub struct Scanner {
25	mut:
26	text []u8
27	pos int // the position of the token in scanner text
28	line int = 1
29	col int = 1
30	}
31
32	// ReaderScanner tokenizes JSON incrementally from any io.Reader.
33	pub struct ReaderScanner {
34	mut:
35	reader &io.BufferedReader
36	peeked bool
37	ch u8
38	line int = 1
39	col int = 1
40	}
41
42	// ReaderScannerConfig configures a reader-backed JSON scanner.
43	@[params]
44	pub struct ReaderScannerConfig {
45	pub:
46	reader io.Reader
47	buffer_size int = 128 * 1024
48	}
49
50	// TokenKind identifies the kind of a JSON token.
51	pub enum TokenKind {
52	none
53	error
54	str
55	float
56	int
57	null
58	bool
59	eof
60	comma = 44 // ,
61	colon = 58 // :
62	lsbr = 91 // [
63	rsbr = 93 // ]
64	lcbr = 123 // {
65	rcbr = 125 // }
66	}
67
68	// new_scanner creates an iterative scanner for an in-memory JSON string.
69	pub fn new_scanner(text string) Scanner {
70	return Scanner{
71	text: text.bytes()
72	line: 1
73	col: 1
74	}
75	}
76
77	// new_scanner_from_bytes creates an iterative scanner for an in-memory JSON byte slice.
78	pub fn new_scanner_from_bytes(text []u8) Scanner {
79	return Scanner{
80	text: text
81	line: 1
82	col: 1
83	}
84	}
85
86	// new_reader_scanner creates an iterative scanner that reads JSON tokens from an io.Reader.
87	pub fn new_reader_scanner(config ReaderScannerConfig) &ReaderScanner {
88	return &ReaderScanner{
89	reader: io.new_buffered_reader(reader: config.reader, cap: config.buffer_size)
90	line: 1
91	col: 1
92	}
93	}
94
95	// free releases the reader scanner's internal buffer.
96	pub fn (mut s ReaderScanner) free() {
97	s.reader.free()
98	}
99
100	pub struct Token {
101	pub:
102	lit []u8 // literal representation of the token
103	kind TokenKind // the token number/enum; for quick comparisons
104	line int // the line in the source where the token occurred
105	col int // the column in the source where the token occurred
106	}
107
108	// literal returns the token contents as a string.
109	pub fn (t Token) literal() string {
110	return t.lit.bytestr()
111	}
112
113	// full_col returns the full column information which includes the length.
114	pub fn (t Token) full_col() int {
115	return t.col + t.lit.len
116	}
117
118	// is_eof reports whether the token marks the end of the JSON stream.
119	pub fn (t Token) is_eof() bool {
120	return t.kind == .eof
121	}
122
123	// list of characters commonly used in JSON.
124	const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]!
125	// list of newlines to check when moving to a new position.
126	const newlines = [`\r`, `\n`, `\t`]!
127	// list of escapable that needs to be escaped inside a JSON string.
128	// double quotes and forward slashes are excluded intentionally since
129	// they have their own separate checks for it in order to pass the
130	// JSON test suite (https://github.com/nst/JSONTestSuite/).
131	const important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]!
132	// list of valid unicode escapes aside from \u{4-hex digits}
133	const valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]!
134	// used for transforming escapes into valid unicode (eg. n => \n)
135	const unicode_transform_escapes = {
136	98: `\b`
137	102: `\f`
138	110: `\n`
139	114: `\r`
140	116: `\t`
141	92: `\\`
142	34: `"`
143	47: `/`
144	}
145	const exp_signs = [u8(`-`), `+`]!
146
147	fn new_scan_error(message string, line int, col int) JsonScanError {
148	return JsonScanError{
149	message: message
150	line: line
151	character: col
152	}
153	}
154
155	fn token_to_scan_error(token Token) JsonScanError {
156	return new_scan_error(token.literal(), token.line, token.col)
157	}
158
159	fn important_escapable_char(ch u8) ?u8 {
160	return match ch {
161	`\b` { `b` }
162	`\f` { `f` }
163	`\n` { `n` }
164	`\r` { `r` }
165	`\t` { `t` }
166	else { none }
167	}
168	}
169
170	fn invalid_token_description(ch u8) string {
171	if ch >= 32 && ch <= 126 {
172	x := ch.ascii_str()
173	return 'invalid token `${x}`'
174	} else {
175	x := ch.str_escaped()
176	return 'invalid token `${x}`'
177	}
178	}
179
180	// move_pos proceeds to the next position.
181	fn (mut s Scanner) move() {
182	s.move_pos(true, true)
183	}
184
185	// move_pos_with_newlines is the same as move_pos but only enables newline checking.
186	fn (mut s Scanner) move_pos_with_newlines() {
187	s.move_pos(false, true)
188	}
189
190	fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
191	s.pos++
192	if s.pos < s.text.len {
193	if include_newlines && s.text[s.pos] in newlines {
194	s.line++
195	s.col = 0
196	if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
197	s.pos++
198	}
199	for s.pos < s.text.len && s.text[s.pos] in newlines {
200	s.move()
201	}
202	} else if include_space && s.text[s.pos] == ` ` {
203	s.pos++
204	s.col++
205	for s.pos < s.text.len && s.text[s.pos] == ` ` {
206	s.move()
207	}
208	}
209	} else {
210	s.col++
211	}
212	}
213
214	// error returns an error token.
215	fn (s &Scanner) error(description string) Token {
216	return s.tokenize(description.bytes(), .error)
217	}
218
219	// tokenize returns a token based on the given lit and kind.
220	fn (s &Scanner) tokenize(lit []u8, kind TokenKind) Token {
221	return Token{
222	lit: lit
223	kind: kind
224	col: s.col
225	line: s.line
226	}
227	}
228
229	// text_scan scans and returns a string token.
230	@[manualfree]
231	fn (mut s Scanner) text_scan() Token {
232	mut has_closed := false
233	mut chrs := []u8{}
234	for {
235	s.pos++
236	s.col++
237	if s.pos >= s.text.len {
238	break
239	}
240	ch := s.text[s.pos]
241	if ch == `"` {
242	has_closed = true
243	break
244	} else if escaped := important_escapable_char(ch) {
245	return s.error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}')
246	} else if ch < 0x20 {
247	return s.error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}')
248	} else if ch == `\\` {
249	if s.pos == s.text.len - 1 {
250	return s.error('incomplete backslash escape at end of JSON input')
251	}
252
253	peek := s.text[s.pos + 1]
254	if peek in valid_unicode_escapes {
255	chrs << unicode_transform_escapes[int(peek)]
256	s.pos++
257	s.col++
258	continue
259	} else if peek == `u` {
260	if s.pos + 5 < s.text.len {
261	s.pos++
262	s.col++
263	mut codepoint := []u8{}
264	codepoint_start := s.pos
265	for s.pos < s.text.len && s.pos < codepoint_start + 4 {
266	s.pos++
267	s.col++
268	if s.text[s.pos] == `"` {
269	break
270	} else if !s.text[s.pos].is_hex_digit() {
271	x := s.text[s.pos].ascii_str()
272	return s.error('`${x}` is not a hex digit')
273	}
274	codepoint << s.text[s.pos]
275	}
276	if codepoint.len != 4 {
277	return s.error('unicode escape must have 4 hex digits')
278	}
279	val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
280	converted := utf32_to_str(val)
281	converted_bytes := converted.bytes()
282	chrs << converted_bytes
283	unsafe {
284	converted.free()
285	converted_bytes.free()
286	codepoint.free()
287	}
288	continue
289	} else {
290	return s.error('incomplete unicode escape')
291	}
292	} else if peek == `U` {
293	return s.error('unicode endpoints must be in lowercase `u`')
294	} else if peek == u8(229) {
295	return s.error('unicode endpoint not allowed')
296	} else {
297	return s.error('invalid backslash escape')
298	}
299	}
300	chrs << ch
301	}
302	tok := s.tokenize(chrs, .str)
303	s.move()
304	if !has_closed {
305	return s.error('missing double quotes in string closing')
306	}
307	return tok
308	}
309
// num_scan scans and returns an int/float token.
// It must be called with the scanner positioned on a digit or on `-`.
// The collected literal (sign, digits, optional fraction, optional exponent) is handed
// to `s.tokenize` with kind `.float` when a `.` was seen and `.int` otherwise.
// Malformed numbers are reported through `s.error` / `s.invalid_token`.
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1 // index of `.` inside `digits`, -1 while no dot has been seen
	mut digits := []u8{}
	if s.text[s.pos] == `-` {
		digits << `-`
		// a minus sign must be directly followed by a digit
		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		s.move_pos_with_newlines()
	}
	// `0` may only be followed by `.`, an exponent or the end of the number
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	// integer part and at most one `.` followed by the fraction digits
	for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		s.move_pos_with_newlines()
	}
	// A dot has to be followed by at least one digit. `dot_index` indexes `digits`,
	// so it is compared against `digits.len`; comparing it against the length of the
	// input text let a trailing `1.` at the very end of the input slip through.
	if dot_index != -1 && dot_index + 1 == digits.len {
		return s.error('invalid float')
	}
	if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		s.move_pos_with_newlines()
		// optional exponent sign
		if s.pos < s.text.len && s.text[s.pos] in exp_signs {
			digits << s.text[s.pos]
			s.move_pos_with_newlines()
		}
		mut exp_digits_count := 0
		for s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			s.move_pos_with_newlines()
		}
		// an exponent marker without digits is not a number
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int }
	return s.tokenize(digits, kind)
}
358
// invalid_token builds an error token for the byte at the scanner's current position,
// using the shared invalid-token description as the message.
fn (s &Scanner) invalid_token() Token {
	offending := s.text[s.pos]
	description := invalid_token_description(offending)
	return s.error(description)
}
363
// next scans one token from the in-memory scanner and returns it.
// A token of kind `.error` is converted with `token_to_scan_error` and returned as an error.
pub fn (mut s Scanner) next() !Token {
	scanned := s.scan()
	if scanned.kind != .error {
		return scanned
	}
	return token_to_scan_error(scanned)
}
372
// scan produces the token that starts at the scanner's current position.
// It is the primitive behind `next`; invalid input is reported through
// `invalid_token` instead of a V error.
@[manualfree]
fn (mut s Scanner) scan() Token {
	// step over a blank or newline character sitting in front of the token
	if s.pos < s.text.len {
		lead := s.text[s.pos]
		if lead == ` ` || lead in newlines {
			s.move()
		}
	}
	if s.pos >= s.text.len {
		return s.tokenize([]u8{}, .eof)
	}
	first := s.text[s.pos]
	// `true` / `null`: only attempted when four bytes are available
	if (first == `t` || first == `n`) && s.pos + 3 < s.text.len {
		word := s.text[s.pos..s.pos + 4].bytestr()
		is_true := word == 'true'
		recognised := is_true || word == 'null'
		unsafe { word.free() }
		if !recognised {
			return s.invalid_token()
		}
		kind := if is_true { TokenKind.bool } else { TokenKind.null }
		tok := s.tokenize(s.text[s.pos..s.pos + 4], kind)
		// consume the four bytes of the keyword
		for _ in 0 .. 4 {
			s.move()
		}
		return tok
	}
	// `false`: only attempted when five bytes are available
	if first == `f` && s.pos + 4 < s.text.len {
		word := s.text[s.pos..s.pos + 5].bytestr()
		recognised := word == 'false'
		unsafe { word.free() }
		if !recognised {
			return s.invalid_token()
		}
		tok := s.tokenize(s.text[s.pos..s.pos + 5], .bool)
		// consume the five bytes of the keyword
		for _ in 0 .. 5 {
			s.move()
		}
		return tok
	}
	// single-byte tokens: the token kind is the byte value itself
	if first in char_list {
		tok := s.tokenize([]u8{}, unsafe { TokenKind(int(first)) })
		s.move()
		return tok
	}
	if first == `"` {
		return s.text_scan()
	}
	if first.is_digit() || first == `-` {
		return s.num_scan()
	}
	return s.invalid_token()
}
428
// tokenize packs a literal, its kind and its starting line/column into a Token.
fn (mut s ReaderScanner) tokenize(lit []u8, kind TokenKind, line int, col int) Token {
	tok := Token{
		kind: kind
		lit:  lit
		line: line
		col:  col
	}
	return tok
}
437
// has_next_byte reports whether another byte is available.
// When no byte is buffered yet, one byte is read from the underlying reader and kept
// in `s.ch` with `s.peeked` set. `io.Eof` and a zero-length read both yield `false`;
// any other reader error is propagated.
fn (mut s ReaderScanner) has_next_byte() !bool {
	if !s.peeked {
		mut one := [u8(0)]
		got := s.reader.read(mut one) or {
			if err is io.Eof {
				return false
			}
			return err
		}
		if got == 0 {
			return false
		}
		s.ch = one[0]
		s.peeked = true
	}
	return true
}
456
// peek_byte returns the next byte without consuming it.
// It returns `io.Eof` when no further byte is available.
fn (mut s ReaderScanner) peek_byte() !u8 {
	available := s.has_next_byte()!
	if available {
		return s.ch
	}
	return io.Eof{}
}
463
// advance_position updates the line/column counters for a byte that was just consumed.
// A `\n` directly following a `\r` is dropped from the lookahead so the pair is
// handled as one unit. Bytes listed in `newlines` start a new line at column 1;
// every other byte moves the column forward by one.
fn (mut s ReaderScanner) advance_position(ch u8) ! {
	if ch == `\r` {
		followed_by_lf := s.has_next_byte()! && s.ch == `\n`
		if followed_by_lf {
			s.peeked = false
		}
	}
	if ch !in newlines {
		s.col++
		return
	}
	s.line++
	s.col = 1
}
477
// read_byte consumes the next byte and returns it.
// The lookahead flag is cleared before the position counters are updated, so
// `advance_position` is free to look at the byte that follows.
fn (mut s ReaderScanner) read_byte() !u8 {
	consumed := s.peek_byte()!
	s.peeked = false
	s.advance_position(consumed)!
	return consumed
}
484
// skip_whitespace consumes bytes while the next one is a space or is listed in `newlines`.
// It stops at the first other byte or when the input is exhausted.
fn (mut s ReaderScanner) skip_whitespace() ! {
	for {
		more := s.has_next_byte()!
		if !more {
			break
		}
		if s.ch != ` ` && s.ch !in newlines {
			break
		}
		_ = s.read_byte()!
	}
}
498
// scan_ident consumes the keyword `ident` byte by byte and returns a token of `kind`
// positioned at `line`/`col`.
// A byte that differs from the keyword, or input ending early, produces a scan error
// located at the position the scanner had before reading that byte.
fn (mut s ReaderScanner) scan_ident(ident string, kind TokenKind, line int, col int) !Token {
	mut lit := []u8{}
	for i in 0 .. ident.len {
		at_line, at_col := s.line, s.col
		got := s.read_byte() or {
			if err is io.Eof {
				return new_scan_error('unexpected end of JSON input', at_line, at_col)
			}
			return err
		}
		if got != ident[i] {
			return new_scan_error(invalid_token_description(got), at_line, at_col)
		}
		lit << got
	}
	return s.tokenize(lit, kind, line, col)
}
516
// text_scan reads a string token; the scanner must be positioned on the opening quote.
// `line`/`col` are the token's start position and are also used when the closing
// quote is missing. Backslash escapes and `\uXXXX` escapes are decoded into the
// token's literal.
@[manualfree]
fn (mut s ReaderScanner) text_scan(line int, col int) !Token {
	mut chrs := []u8{}
	_ = s.read_byte()! // opening quote
	for {
		here_line, here_col := s.line, s.col
		if !s.has_next_byte()! {
			return new_scan_error('missing double quotes in string closing', line, col)
		}
		ch := s.ch
		if ch == `"` {
			// closing quote ends the string
			_ = s.read_byte()!
			break
		}
		if escaped := important_escapable_char(ch) {
			return new_scan_error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}',
				here_line, here_col)
		}
		if ch < 0x20 {
			return new_scan_error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}',
				here_line, here_col)
		}
		if ch != `\\` {
			// ordinary byte: copy it through unchanged
			chrs << ch
			_ = s.read_byte()!
			continue
		}
		// from here on an escape sequence is being handled
		_ = s.read_byte()!
		escape_line, escape_col := s.line, s.col
		if !s.has_next_byte()! {
			return new_scan_error('incomplete backslash escape at end of JSON input',
				escape_line, escape_col)
		}
		peek := s.ch
		if peek in valid_unicode_escapes {
			chrs << unicode_transform_escapes[int(peek)]
			_ = s.read_byte()!
			continue
		}
		if peek == `U` {
			return new_scan_error('unicode endpoints must be in lowercase `u`', escape_line,
				escape_col)
		}
		if peek == u8(229) {
			return new_scan_error('unicode endpoint not allowed', escape_line, escape_col)
		}
		if peek != `u` {
			return new_scan_error('invalid backslash escape', escape_line, escape_col)
		}
		// \uXXXX: exactly four hex digits are required
		_ = s.read_byte()!
		mut codepoint := []u8{}
		for _ in 0 .. 4 {
			digit_line, digit_col := s.line, s.col
			if !s.has_next_byte()! {
				return new_scan_error('incomplete unicode escape', escape_line, escape_col)
			}
			digit := s.ch
			if digit == `"` {
				return new_scan_error('unicode escape must have 4 hex digits', digit_line,
					digit_col)
			}
			if !digit.is_hex_digit() {
				x := digit.ascii_str()
				return new_scan_error('`${x}` is not a hex digit', digit_line, digit_col)
			}
			codepoint << digit
			_ = s.read_byte()!
		}
		val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
		converted := utf32_to_str(val)
		converted_bytes := converted.bytes()
		chrs << converted_bytes
		// temporaries are released by hand because of @[manualfree]
		unsafe {
			converted.free()
			converted_bytes.free()
			codepoint.free()
		}
	}
	return s.tokenize(chrs, .str, line, col)
}
591
// num_scan reads an int/float token; the scanner must be positioned on a digit or `-`.
// `line`/`col` are the token's start position. The token kind is `.float` when a `.`
// was read and `.int` otherwise. Malformed numbers produce a scan error.
fn (mut s ReaderScanner) num_scan(line int, col int) !Token {
	mut digits := []u8{}
	mut dot_index := -1 // index of `.` inside `digits`, -1 while no dot has been read
	mut is_fl := false
	// optional minus sign, which has to be followed by a digit
	if s.peek_byte()! == `-` {
		digits << `-`
		_ = s.read_byte()!
		if !s.has_next_byte()! {
			return new_scan_error('invalid token `-`', line, col)
		}
		if !s.ch.is_digit() {
			return new_scan_error(invalid_token_description(s.ch), s.line, s.col)
		}
	}
	// a leading `0` must not be followed by another digit
	if s.has_next_byte()! && s.ch == `0` {
		digits << s.ch
		_ = s.read_byte()!
		if s.has_next_byte()! && s.ch.is_digit() {
			return new_scan_error('leading zeroes in a number are not allowed', line, col)
		}
	}
	// integer digits plus at most one `.` with its fraction digits
	for {
		if !s.has_next_byte()! {
			break
		}
		ch := s.ch
		is_dot := ch == `.`
		if !ch.is_digit() && (is_fl || !is_dot) {
			break
		}
		digits << ch
		if is_dot {
			is_fl = true
			dot_index = digits.len - 1
		}
		_ = s.read_byte()!
	}
	// a dot that is the last collected byte has no fraction digits
	if dot_index != -1 && dot_index == digits.len - 1 {
		return new_scan_error('invalid float', line, col)
	}
	if s.has_next_byte()! && (s.ch == `e` || s.ch == `E`) {
		digits << s.ch
		_ = s.read_byte()!
		// optional exponent sign
		if s.has_next_byte()! && s.ch in exp_signs {
			digits << s.ch
			_ = s.read_byte()!
		}
		exp_start := digits.len
		for {
			if !s.has_next_byte()! {
				break
			}
			if !s.ch.is_digit() {
				break
			}
			digits << s.ch
			_ = s.read_byte()!
		}
		// nothing was appended after the marker/sign: the exponent has no digits
		if digits.len == exp_start {
			return new_scan_error('invalid exponent', line, col)
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int }
	return s.tokenize(digits, kind, line, col)
}
666
// next returns the next JSON token from the reader-backed scanner.
// Leading whitespace is skipped first. An `.eof` token is returned once the reader
// is exhausted, and a scan error is returned for a byte that cannot start a token.
pub fn (mut s ReaderScanner) next() !Token {
	s.skip_whitespace()!
	line, col := s.line, s.col
	if !s.has_next_byte()! {
		return s.tokenize([]u8{}, .eof, line, col)
	}
	ch := s.ch
	// keywords are dispatched on their first byte
	if ch == `t` {
		return s.scan_ident('true', TokenKind.bool, line, col)
	}
	if ch == `n` {
		return s.scan_ident('null', TokenKind.null, line, col)
	}
	if ch == `f` {
		return s.scan_ident('false', .bool, line, col)
	}
	// single-byte tokens: the token kind is the byte value itself
	if ch in char_list {
		_ = s.read_byte()!
		return s.tokenize([]u8{}, unsafe { TokenKind(int(ch)) }, line, col)
	}
	if ch == `"` {
		return s.text_scan(line, col)
	}
	if ch.is_digit() || ch == `-` {
		return s.num_scan(line, col)
	}
	return new_scan_error(invalid_token_description(ch), line, col)
}
691