From 4afd2d47afc62f41b90142942b357edcf5956941 Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Wed, 25 Mar 2026 22:30:55 +0300 Subject: [PATCH] json2: fix iterative json parser (fixes #8986) --- vlib/x/json2/README.md | 51 ++++ vlib/x/json2/scanner.v | 391 +++++++++++++++++++++++++++++- vlib/x/json2/tests/scanner_test.v | 102 ++++++++ 3 files changed, 533 insertions(+), 11 deletions(-) create mode 100644 vlib/x/json2/tests/scanner_test.v diff --git a/vlib/x/json2/README.md b/vlib/x/json2/README.md index ae8dfa1fd..43fd152ef 100644 --- a/vlib/x/json2/README.md +++ b/vlib/x/json2/README.md @@ -80,6 +80,57 @@ fn main() { } ``` +#### iterative token scanning + +`x.json2` now exposes low-level scanners that let you process JSON token by +token instead of materializing the whole tree first. + +Use `new_scanner()` for in-memory strings: + +```v +import x.json2 + +fn main() { + mut scanner := json2.new_scanner('{"items":[1,2,3]}') + for { + token := scanner.next()! + if token.is_eof() { + break + } + println('${token.kind}: ${token.literal()}') + } +} +``` + +Use `new_reader_scanner()` to stream tokens from a file or any `io.Reader`: + +```v +import os +import x.json2 + +fn main() { + mut file := os.open('huge.json')! + defer { + file.close() + } + + mut scanner := json2.new_reader_scanner(reader: file) + defer { + scanner.free() + } + + for { + token := scanner.next()! + if token.is_eof() { + break + } + if token.kind == .str && token.literal() == 'id' { + println('found an id key') + } + } +} +``` + #### Casting `Any` type / Navigating ```v diff --git a/vlib/x/json2/scanner.v b/vlib/x/json2/scanner.v index 0ae549185..cb43326e1 100644 --- a/vlib/x/json2/scanner.v +++ b/vlib/x/json2/scanner.v @@ -3,17 +3,52 @@ // that can be found in the LICENSE file. module json2 +import io import strconv -struct Scanner { +// JsonScanError describes a tokenization error reported by the iterative scanner APIs. +pub struct JsonScanError { + Error +pub: + message string + + line int + character int +} + +fn (e JsonScanError) msg() string { + return '${e.line}:${e.character}: Invalid json token: ${e.message}' +} + +// Scanner tokenizes JSON from an in-memory string or byte slice. +pub struct Scanner { mut: text []u8 pos int // the position of the token in scanner text - line int - col int + line int = 1 + col int = 1 } -enum TokenKind { +// ReaderScanner tokenizes JSON incrementally from any io.Reader. +pub struct ReaderScanner { +mut: + reader &io.BufferedReader + peeked bool + ch u8 + line int = 1 + col int = 1 +} + +// ReaderScannerConfig configures a reader-backed JSON scanner. +@[params] +pub struct ReaderScannerConfig { +pub: + reader io.Reader + buffer_size int = 128 * 1024 +} + +// TokenKind identifies the kind of a JSON token. +pub enum TokenKind { none error str @@ -30,18 +65,61 @@ enum TokenKind { rcbr = 125 // } } +// new_scanner creates an iterative scanner for an in-memory JSON string. +pub fn new_scanner(text string) Scanner { + return Scanner{ + text: text.bytes() + line: 1 + col: 1 + } +} + +// new_scanner_from_bytes creates an iterative scanner for an in-memory JSON byte slice. +pub fn new_scanner_from_bytes(text []u8) Scanner { + return Scanner{ + text: text + line: 1 + col: 1 + } +} + +// new_reader_scanner creates an iterative scanner that reads JSON tokens from an io.Reader. +pub fn new_reader_scanner(config ReaderScannerConfig) &ReaderScanner { + return &ReaderScanner{ + reader: io.new_buffered_reader(reader: config.reader, cap: config.buffer_size) + line: 1 + col: 1 + } +} + +// free releases the reader scanner's internal buffer. +pub fn (mut s ReaderScanner) free() { + s.reader.free() +} + pub struct Token { +pub: lit []u8 // literal representation of the token kind TokenKind // the token number/enum; for quick comparisons line int // the line in the source where the token occurred col int // the column in the source where the token occurred } +// literal returns the token contents as a string. +pub fn (t Token) literal() string { + return t.lit.bytestr() +} + // full_col returns the full column information which includes the length. pub fn (t Token) full_col() int { return t.col + t.lit.len } +// is_eof reports whether the token marks the end of the JSON stream. +pub fn (t Token) is_eof() bool { + return t.kind == .eof +} + // list of characters commonly used in JSON. const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]! // list of newlines to check when moving to a new position. @@ -66,6 +144,18 @@ const unicode_transform_escapes = { } const exp_signs = [u8(`-`), `+`]! +fn new_scan_error(message string, line int, col int) JsonScanError { + return JsonScanError{ + message: message + line: line + character: col + } +} + +fn token_to_scan_error(token Token) JsonScanError { + return new_scan_error(token.literal(), token.line, token.col) +} + fn important_escapable_char(ch u8) ?u8 { return match ch { `\b` { `b` } @@ -77,6 +167,16 @@ fn important_escapable_char(ch u8) ?u8 { } } +fn invalid_token_description(ch u8) string { + if ch >= 32 && ch <= 126 { + x := ch.ascii_str() + return 'invalid token `${x}`' + } else { + x := ch.str_escaped() + return 'invalid token `${x}`' + } +} + // move_pos proceeds to the next position. fn (mut s Scanner) move() { s.move_pos(true, true) @@ -216,7 +316,7 @@ fn (mut s Scanner) num_scan() Token { mut digits := []u8{} if s.text[s.pos] == `-` { digits << `-` - if !s.text[s.pos + 1].is_digit() { + if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() { return s.invalid_token() } s.move_pos_with_newlines() @@ -258,13 +358,16 @@ fn (mut s Scanner) num_scan() Token { // invalid_token returns an error token with the invalid token message. fn (s &Scanner) invalid_token() Token { - if s.text[s.pos] >= 32 && s.text[s.pos] <= 126 { - x := s.text[s.pos].ascii_str() - return s.error('invalid token `${x}`') - } else { - x := s.text[s.pos].str_escaped() - return s.error('invalid token `${x}`') + return s.error(invalid_token_description(s.text[s.pos])) +} + +// next returns the next JSON token from the in-memory scanner. +pub fn (mut s Scanner) next() !Token { + tok := s.scan() + if tok.kind == .error { + return token_to_scan_error(tok) } + return tok } // scan returns a token based on the scanner's current position. @@ -322,3 +425,269 @@ fn (mut s Scanner) scan() Token { return s.invalid_token() } } + +fn (mut s ReaderScanner) tokenize(lit []u8, kind TokenKind, line int, col int) Token { + return Token{ + lit: lit + kind: kind + line: line + col: col + } +} + +fn (mut s ReaderScanner) has_next_byte() !bool { + if s.peeked { + return true + } + mut buf := [u8(0)] + n := s.reader.read(mut buf) or { + if err is io.Eof { + return false + } + return err + } + if n == 0 { + return false + } + s.ch = buf[0] + s.peeked = true + return true +} + +fn (mut s ReaderScanner) peek_byte() !u8 { + if !s.has_next_byte()! { + return io.Eof{} + } + return s.ch +} + +fn (mut s ReaderScanner) advance_position(ch u8) ! { + if ch == `\r` { + if s.has_next_byte()! && s.ch == `\n` { + s.peeked = false + } + } + if ch in newlines { + s.line++ + s.col = 1 + return + } + s.col++ +} + +fn (mut s ReaderScanner) read_byte() !u8 { + ch := s.peek_byte()! + s.peeked = false + s.advance_position(ch)! + return ch +} + +fn (mut s ReaderScanner) skip_whitespace() ! { + for { + if !s.has_next_byte()! { + return + } + ch := s.ch + if ch == ` ` || ch in newlines { + _ = s.read_byte()! + continue + } + return + } +} + +fn (mut s ReaderScanner) scan_ident(ident string, kind TokenKind, line int, col int) !Token { + mut lit := []u8{} + for expected in ident.bytes() { + current_line, current_col := s.line, s.col + ch := s.read_byte() or { + if err is io.Eof { + return new_scan_error('unexpected end of JSON input', current_line, current_col) + } + return err + } + if ch != expected { + return new_scan_error(invalid_token_description(ch), current_line, current_col) + } + lit << ch + } + return s.tokenize(lit, kind, line, col) +} + +@[manualfree] +fn (mut s ReaderScanner) text_scan(line int, col int) !Token { + mut chrs := []u8{} + _ = s.read_byte()! // opening quote + for { + current_line, current_col := s.line, s.col + if !s.has_next_byte()! { + return new_scan_error('missing double quotes in string closing', line, col) + } + ch := s.ch + if ch == `"` { + _ = s.read_byte()! + break + } else if escaped := important_escapable_char(ch) { + return new_scan_error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}', + current_line, current_col) + } else if ch < 0x20 { + return new_scan_error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}', + current_line, current_col) + } else if ch == `\\` { + _ = s.read_byte()! + escape_line, escape_col := s.line, s.col + if !s.has_next_byte()! { + return new_scan_error('incomplete backslash escape at end of JSON input', + escape_line, escape_col) + } + peek := s.ch + if peek in valid_unicode_escapes { + chrs << unicode_transform_escapes[int(peek)] + _ = s.read_byte()! + continue + } else if peek == `u` { + _ = s.read_byte()! + mut codepoint := []u8{} + for _ in 0 .. 4 { + digit_line, digit_col := s.line, s.col + if !s.has_next_byte()! { + return new_scan_error('incomplete unicode escape', escape_line, + escape_col) + } + digit := s.ch + if digit == `"` { + return new_scan_error('unicode escape must have 4 hex digits', + digit_line, digit_col) + } else if !digit.is_hex_digit() { + x := digit.ascii_str() + return new_scan_error('`${x}` is not a hex digit', digit_line, + digit_col) + } + codepoint << digit + _ = s.read_byte()! + } + val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 }) + converted := utf32_to_str(val) + converted_bytes := converted.bytes() + chrs << converted_bytes + unsafe { + converted.free() + converted_bytes.free() + codepoint.free() + } + continue + } else if peek == `U` { + return new_scan_error('unicode endpoints must be in lowercase `u`', escape_line, + escape_col) + } else if peek == u8(229) { + return new_scan_error('unicode endpoint not allowed', escape_line, escape_col) + } else { + return new_scan_error('invalid backslash escape', escape_line, escape_col) + } + } + chrs << ch + _ = s.read_byte()! + } + return s.tokenize(chrs, .str, line, col) +} + +fn (mut s ReaderScanner) num_scan(line int, col int) !Token { + mut is_fl := false + mut dot_index := -1 + mut digits := []u8{} + if s.peek_byte()! == `-` { + digits << `-` + _ = s.read_byte()! + if !s.has_next_byte()! { + return new_scan_error('invalid token `-`', line, col) + } + next := s.ch + if !next.is_digit() { + return new_scan_error(invalid_token_description(next), s.line, s.col) + } + } + if s.has_next_byte()! { + first := s.ch + if first == `0` { + digits << first + _ = s.read_byte()! + if s.has_next_byte()! && s.ch.is_digit() { + return new_scan_error('leading zeroes in a number are not allowed', line, + col) + } + } + } + for { + if !s.has_next_byte()! { + break + } + ch := s.ch + if ch.is_digit() || (!is_fl && ch == `.`) { + digits << ch + if ch == `.` { + is_fl = true + dot_index = digits.len - 1 + } + _ = s.read_byte()! + continue + } + break + } + if dot_index != -1 && digits[dot_index + 1..].len == 0 { + return new_scan_error('invalid float', line, col) + } + if s.has_next_byte()! { + ch := s.ch + if ch == `e` || ch == `E` { + digits << ch + _ = s.read_byte()! + if s.has_next_byte()! && s.ch in exp_signs { + digits << s.ch + _ = s.read_byte()! + } + mut exp_digits_count := 0 + for { + if !s.has_next_byte()! { + break + } + digit := s.ch + if !digit.is_digit() { + break + } + digits << digit + exp_digits_count++ + _ = s.read_byte()! + } + if exp_digits_count == 0 { + return new_scan_error('invalid exponent', line, col) + } + } + } + kind := if is_fl { TokenKind.float } else { TokenKind.int } + return s.tokenize(digits, kind, line, col) +} + +// next returns the next JSON token from the reader-backed scanner. +pub fn (mut s ReaderScanner) next() !Token { + s.skip_whitespace()! + line, col := s.line, s.col + if !s.has_next_byte()! { + return s.tokenize([]u8{}, .eof, line, col) + } + ch := s.ch + if ch == `t` || ch == `n` { + ident := if ch == `t` { 'true' } else { 'null' } + kind := if ch == `t` { TokenKind.bool } else { TokenKind.null } + return s.scan_ident(ident, kind, line, col) + } else if ch == `f` { + return s.scan_ident('false', .bool, line, col) + } else if ch in char_list { + _ = s.read_byte()! + return s.tokenize([]u8{}, unsafe { TokenKind(int(ch)) }, line, col) + } else if ch == `"` { + return s.text_scan(line, col) + } else if ch.is_digit() || ch == `-` { + return s.num_scan(line, col) + } + return new_scan_error(invalid_token_description(ch), line, col) +} diff --git a/vlib/x/json2/tests/scanner_test.v b/vlib/x/json2/tests/scanner_test.v new file mode 100644 index 000000000..5e94446c8 --- /dev/null +++ b/vlib/x/json2/tests/scanner_test.v @@ -0,0 +1,102 @@ +import io +import x.json2 as json + +struct ChunkedReader { + data []u8 + chunk_size int +mut: + pos int +} + +fn (mut r ChunkedReader) read(mut buf []u8) !int { + if r.pos >= r.data.len { + return io.Eof{} + } + mut n := r.chunk_size + remaining := r.data.len - r.pos + if n > remaining { + n = remaining + } + if n > buf.len { + n = buf.len + } + read := copy(mut buf[..n], r.data[r.pos..r.pos + n]) + r.pos += read + return read +} + +fn test_public_scanner_iterates_tokens() { + mut scanner := json.new_scanner('{"items":[1,true,null,"x"]}') + mut got := []string{} + for { + token := scanner.next()! + got << '${token.kind}:${token.literal()}' + if token.is_eof() { + break + } + } + assert got == [ + 'lcbr:', + 'str:items', + 'colon:', + 'lsbr:', + 'int:1', + 'comma:', + 'bool:true', + 'comma:', + 'null:null', + 'comma:', + 'str:x', + 'rsbr:', + 'rcbr:', + 'eof:', + ] +} + +fn test_reader_scanner_iterates_tokens() { + mut reader := ChunkedReader{ + data: '[10,true,"chunked",null]'.bytes() + chunk_size: 2 + } + mut scanner := json.new_reader_scanner(reader: reader, buffer_size: 3) + defer { + scanner.free() + } + mut got := []string{} + for { + token := scanner.next()! + got << '${token.kind}:${token.literal()}' + if token.is_eof() { + break + } + } + assert got == [ + 'lsbr:', + 'int:10', + 'comma:', + 'bool:true', + 'comma:', + 'str:chunked', + 'comma:', + 'null:null', + 'rsbr:', + 'eof:', + ] +} + +fn test_reader_scanner_reports_errors() { + mut reader := ChunkedReader{ + data: r'["\z"]'.bytes() + chunk_size: 1 + } + mut scanner := json.new_reader_scanner(reader: reader, buffer_size: 1) + defer { + scanner.free() + } + assert scanner.next()!.kind == .lsbr + scanner.next() or { + assert err.msg().contains('invalid backslash escape') + return + } + assert false +} -- 2.39.5