From 4afd2d47afc62f41b90142942b357edcf5956941 Mon Sep 17 00:00:00 2001
From: Alexander Medvednikov <alexander@medvednikov.com>
Date: Wed, 25 Mar 2026 22:30:55 +0300
Subject: [PATCH] json2: fix iterative json parser (fixes #8986)

---
 vlib/x/json2/README.md            |  51 ++++
 vlib/x/json2/scanner.v            | 391 +++++++++++++++++++++++++++++-
 vlib/x/json2/tests/scanner_test.v | 102 ++++++++
 3 files changed, 533 insertions(+), 11 deletions(-)
 create mode 100644 vlib/x/json2/tests/scanner_test.v

diff --git a/vlib/x/json2/README.md b/vlib/x/json2/README.md
index ae8dfa1fd..43fd152ef 100644
--- a/vlib/x/json2/README.md
+++ b/vlib/x/json2/README.md
@@ -80,6 +80,57 @@ fn main() {
 }
 ```
 
+#### Iterative token scanning
+
+`x.json2` exposes low-level scanners that let you process JSON token by
+token instead of materializing the whole tree first.
+
+Use `new_scanner()` for in-memory strings:
+
+```v
+import x.json2
+
+fn main() {
+	mut scanner := json2.new_scanner('{"items":[1,2,3]}')
+	for {
+		token := scanner.next()!
+		if token.is_eof() {
+			break
+		}
+		println('${token.kind}: ${token.literal()}')
+	}
+}
+```
+
+Use `new_reader_scanner()` to stream tokens from a file or any `io.Reader`:
+
+```v
+import os
+import x.json2
+
+fn main() {
+	mut file := os.open('huge.json')!
+	defer {
+		file.close()
+	}
+
+	mut scanner := json2.new_reader_scanner(reader: file)
+	defer {
+		scanner.free()
+	}
+
+	for {
+		token := scanner.next()!
+		if token.is_eof() {
+			break
+		}
+		if token.kind == .str && token.literal() == 'id' {
+			println('found an id key')
+		}
+	}
+}
+```
+
 #### Casting `Any` type / Navigating
 
 ```v
diff --git a/vlib/x/json2/scanner.v b/vlib/x/json2/scanner.v
index 0ae549185..cb43326e1 100644
--- a/vlib/x/json2/scanner.v
+++ b/vlib/x/json2/scanner.v
@@ -3,17 +3,52 @@
 // that can be found in the LICENSE file.
 module json2
 
+import io
 import strconv
 
-struct Scanner {
+// JsonScanError describes a tokenization error reported by the iterative scanner APIs.
+pub struct JsonScanError {
+	Error
+pub:
+	message string // description of the problem; msg() adds the position prefix
+
+	line      int // line reported for the error
+	character int // column reported for the error (new_scan_error stores its `col` here)
+}
+
+fn (e JsonScanError) msg() string {
+	return '${e.line}:${e.character}: Invalid json token: ' + e.message
+}
+
+// Scanner tokenizes JSON from an in-memory string or byte slice.
+pub struct Scanner {
 mut:
 	text []u8
 	pos  int // the position of the token in scanner text
-	line int
-	col  int
+	line int = 1
+	col  int = 1
 }
 
-enum TokenKind {
+// ReaderScanner tokenizes JSON incrementally from any io.Reader.
+pub struct ReaderScanner {
+mut:
+	reader &io.BufferedReader // buffered source the scanner pulls bytes from
+	peeked bool // true while `ch` holds a byte that was read but not yet consumed
+	ch     u8 // the look-ahead byte; set by has_next_byte together with `peeked`
+	line   int = 1 // line of the next unconsumed byte
+	col    int = 1 // column of the next unconsumed byte
+}
+
+// ReaderScannerConfig configures a reader-backed JSON scanner.
+@[params]
+pub struct ReaderScannerConfig {
+pub:
+	reader      io.Reader // source that new_reader_scanner wraps in an io.BufferedReader
+	buffer_size int = 128 * 1024 // capacity requested for that buffered reader (128 KiB by default)
+}
+
+// TokenKind identifies the kind of a JSON token.
+pub enum TokenKind {
 	none
 	error
 	str
@@ -30,18 +65,61 @@ enum TokenKind {
 	rcbr  = 125 // }
 }
 
+// new_scanner returns an iterative scanner over the bytes of an in-memory JSON string.
+pub fn new_scanner(text string) Scanner {
+	mut scanner := Scanner{}
+	scanner.text = text.bytes()
+	scanner.line = 1
+	scanner.col = 1
+	return scanner
+}
+
+// new_scanner_from_bytes creates an iterative scanner for an in-memory JSON byte slice.
+pub fn new_scanner_from_bytes(text []u8) Scanner {
+	return Scanner{
+		text: text // stored as passed in; no explicit clone is made here
+		line: 1 // positions start at line 1, column 1
+		col:  1
+	}
+}
+
+// new_reader_scanner creates an iterative scanner that reads JSON tokens from an io.Reader.
+pub fn new_reader_scanner(config ReaderScannerConfig) &ReaderScanner {
+	// a zero or negative buffer_size is replaced by the default capacity before buffering
+	buffer_cap := if config.buffer_size > 0 { config.buffer_size } else { 128 * 1024 }
+	return &ReaderScanner{
+		reader: io.new_buffered_reader(reader: config.reader, cap: buffer_cap)
+	}
+}
+
+// free forwards to free() on the scanner's internal buffered reader; call it when scanning is done.
+pub fn (mut s ReaderScanner) free() {
+	s.reader.free()
+}
+
 pub struct Token {
+pub:
 	lit  []u8      // literal representation of the token
 	kind TokenKind // the token number/enum; for quick comparisons
 	line int       // the line in the source where the token occurred
 	col  int       // the column in the source where the token occurred
 }
 
+// literal returns the token's `lit` bytes as a string (empty when the token carries no literal).
+pub fn (t Token) literal() string {
+	return t.lit.bytestr()
+}
+
 // full_col returns the full column information which includes the length.
 pub fn (t Token) full_col() int {
 	return t.col + t.lit.len
 }
 
+// is_eof reports whether the token marks the end of the JSON input, i.e. its kind is `.eof`.
+pub fn (t Token) is_eof() bool {
+	return t.kind == .eof
+}
+
 // list of characters commonly used in JSON.
 const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]!
 // list of newlines to check when moving to a new position.
@@ -66,6 +144,18 @@ const unicode_transform_escapes = {
 }
 const exp_signs = [u8(`-`), `+`]!
 
+// new_scan_error builds a JsonScanError for `message` at the given position.
+// The column is stored in the error's `character` field, which is what
+// JsonScanError.msg() prints after the line number.
+fn new_scan_error(message string, line int, col int) JsonScanError {
+	scan_error := JsonScanError{message: message, line: line, character: col}
+	return scan_error
+}
+
+fn token_to_scan_error(token Token) JsonScanError {
+	return JsonScanError{message: token.lit.bytestr(), line: token.line, character: token.col}
+}
+
 fn important_escapable_char(ch u8) ?u8 {
 	return match ch {
 		`\b` { `b` }
@@ -77,6 +167,16 @@ fn important_escapable_char(ch u8) ?u8 {
 	}
 }
 
+// invalid_token_description formats the error text for an unexpected byte.
+fn invalid_token_description(ch u8) string {
+	shown := if ch >= 32 && ch <= 126 {
+		ch.ascii_str() // printable ASCII is shown verbatim
+	} else {
+		ch.str_escaped() // any other byte is shown in escaped form
+	}
+	return 'invalid token `${shown}`'
+}
+
 // move_pos proceeds to the next position.
 fn (mut s Scanner) move() {
 	s.move_pos(true, true)
@@ -216,7 +316,7 @@ fn (mut s Scanner) num_scan() Token {
 	mut digits := []u8{}
 	if s.text[s.pos] == `-` {
 		digits << `-`
-		if !s.text[s.pos + 1].is_digit() {
+		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
 			return s.invalid_token()
 		}
 		s.move_pos_with_newlines()
@@ -258,13 +358,16 @@ fn (mut s Scanner) num_scan() Token {
 
 // invalid_token returns an error token with the invalid token message.
 fn (s &Scanner) invalid_token() Token {
-	if s.text[s.pos] >= 32 && s.text[s.pos] <= 126 {
-		x := s.text[s.pos].ascii_str()
-		return s.error('invalid token `${x}`')
-	} else {
-		x := s.text[s.pos].str_escaped()
-		return s.error('invalid token `${x}`')
+	return s.error(invalid_token_description(s.text[s.pos]))
+}
+
+// next returns the next JSON token from the in-memory scanner.
+pub fn (mut s Scanner) next() !Token {
+	tok := s.scan()
+	if tok.kind == .error {
+		return token_to_scan_error(tok)
 	}
+	return tok
 }
 
 // scan returns a token based on the scanner's current position.
@@ -322,3 +425,269 @@ fn (mut s Scanner) scan() Token {
 		return s.invalid_token()
 	}
 }
+
+fn (mut s ReaderScanner) tokenize(lit []u8, kind TokenKind, line int, col int) Token {
+	return Token{
+		lit:  lit // literal bytes; next() passes an empty slice for eof and punctuation
+		kind: kind
+		line: line // start position of the token, captured by the caller
+		col:  col
+	}
+}
+
+fn (mut s ReaderScanner) has_next_byte() !bool {
+	if s.peeked {
+		return true // a look-ahead byte is already waiting in s.ch
+	}
+	mut buf := [u8(0)] // NOTE(review): a new one-element array per call; consider reusing one
+	n := s.reader.read(mut buf) or {
+		if err is io.Eof {
+			return false // end of input is reported as "no next byte", not as an error
+		}
+		return err // any other read error is passed on to the caller
+	}
+	if n == 0 {
+		return false // a zero-length read is treated the same as end of input
+	}
+	s.ch = buf[0]
+	s.peeked = true // the byte stays buffered until read_byte consumes it
+	return true
+}
+
+fn (mut s ReaderScanner) peek_byte() !u8 {
+	if s.has_next_byte()! {
+		return s.ch // the look-ahead byte; it is not consumed here
+	}
+	return io.Eof{}
+}
+
+fn (mut s ReaderScanner) advance_position(ch u8) ! {
+	if ch == `\r` {
+		if s.has_next_byte()! && s.ch == `\n` {
+			s.peeked = false // drop the `\n` of a `\r\n` pair without counting it
+		}
+	}
+	if ch !in newlines {
+		s.col++
+		return
+	}
+	s.line++
+	s.col = 1
+}
+
+fn (mut s ReaderScanner) read_byte() !u8 {
+	consumed := s.peek_byte()!
+	s.peeked = false // mark the look-ahead as used before the position is advanced
+	s.advance_position(consumed)!
+	return consumed
+}
+
+fn (mut s ReaderScanner) skip_whitespace() ! {
+	for {
+		has_more := s.has_next_byte()!
+		if !has_more {
+			return
+		}
+		is_blank := s.ch == ` ` || s.ch in newlines
+		if !is_blank {
+			return
+		}
+		_ = s.read_byte()!
+	}
+}
+
+fn (mut s ReaderScanner) scan_ident(ident string, kind TokenKind, line int, col int) !Token {
+	mut matched := []u8{}
+	for want in ident.bytes() {
+		at_line, at_col := s.line, s.col
+		got := s.read_byte() or {
+			if err !is io.Eof {
+				return err
+			}
+			return new_scan_error('unexpected end of JSON input', at_line, at_col)
+		}
+		if got != want {
+			return new_scan_error(invalid_token_description(got), at_line, at_col)
+		}
+		matched << got
+	}
+	return s.tokenize(matched, kind, line, col)
+}
+
+@[manualfree]
+fn (mut s ReaderScanner) text_scan(line int, col int) !Token {
+	mut chrs := []u8{} // decoded string contents, without the surrounding quotes
+	_ = s.read_byte()! // opening quote
+	for {
+		current_line, current_col := s.line, s.col // position of the byte inspected next
+		if !s.has_next_byte()! {
+			return new_scan_error('missing double quotes in string closing', line, col)
+		}
+		ch := s.ch
+		if ch == `"` {
+			_ = s.read_byte()! // the closing quote ends the string
+			break
+		} else if escaped := important_escapable_char(ch) {
+			return new_scan_error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}',
+				current_line, current_col)
+		} else if ch < 0x20 {
+			return new_scan_error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}',
+				current_line, current_col)
+		} else if ch == `\\` {
+			_ = s.read_byte()! // consume the backslash itself
+			escape_line, escape_col := s.line, s.col // position right after the backslash
+			if !s.has_next_byte()! {
+				return new_scan_error('incomplete backslash escape at end of JSON input',
+					escape_line, escape_col)
+			}
+			peek := s.ch
+			if peek in valid_unicode_escapes {
+				chrs << unicode_transform_escapes[int(peek)] // append the mapped byte for this escape
+				_ = s.read_byte()!
+				continue
+			} else if peek == `u` {
+				_ = s.read_byte()! // consume the `u`
+				mut codepoint := []u8{}
+				for _ in 0 .. 4 { // exactly four hex digits are required
+					digit_line, digit_col := s.line, s.col
+					if !s.has_next_byte()! {
+						return new_scan_error('incomplete unicode escape', escape_line,
+							escape_col)
+					}
+					digit := s.ch
+					if digit == `"` {
+						return new_scan_error('unicode escape must have 4 hex digits',
+							digit_line, digit_col)
+					} else if !digit.is_hex_digit() {
+						x := digit.ascii_str()
+						return new_scan_error('`${x}` is not a hex digit', digit_line,
+							digit_col)
+					}
+					codepoint << digit
+					_ = s.read_byte()!
+				}
+				val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 }) // digits were validated above
+				converted := utf32_to_str(val) // NOTE(review): each \uXXXX is converted on its own; surrogate pairs are not combined here
+				converted_bytes := converted.bytes()
+				chrs << converted_bytes
+				unsafe { // temporaries are released by hand; the function is marked @[manualfree]
+					converted.free()
+					converted_bytes.free()
+					codepoint.free()
+				}
+				continue
+			} else if peek == `U` {
+				return new_scan_error('unicode endpoints must be in lowercase `u`', escape_line,
+					escape_col)
+			} else if peek == u8(229) {
+				return new_scan_error('unicode endpoint not allowed', escape_line, escape_col)
+			} else {
+				return new_scan_error('invalid backslash escape', escape_line, escape_col)
+			}
+		}
+		chrs << ch // ordinary byte: copied into the literal unchanged
+		_ = s.read_byte()!
+	}
+	return s.tokenize(chrs, .str, line, col)
+}
+
+fn (mut s ReaderScanner) num_scan(line int, col int) !Token {
+	mut is_fl := false // set once a `.` has been accepted
+	mut dot_index := -1 // index of the `.` inside digits, or -1 when there is none
+	mut digits := []u8{} // raw bytes of the number literal as they are read
+	if s.peek_byte()! == `-` {
+		digits << `-`
+		_ = s.read_byte()!
+		if !s.has_next_byte()! {
+			return new_scan_error('invalid token `-`', line, col)
+		}
+		next := s.ch
+		if !next.is_digit() { // a minus sign must be followed directly by a digit
+			return new_scan_error(invalid_token_description(next), s.line, s.col)
+		}
+	}
+	if s.has_next_byte()! {
+		first := s.ch
+		if first == `0` {
+			digits << first
+			_ = s.read_byte()!
+			if s.has_next_byte()! && s.ch.is_digit() { // `0` followed by another digit is rejected
+				return new_scan_error('leading zeroes in a number are not allowed', line,
+					col)
+			}
+		}
+	}
+	for { // integer part and at most one `.` with the digits after it
+		if !s.has_next_byte()! {
+			break
+		}
+		ch := s.ch
+		if ch.is_digit() || (!is_fl && ch == `.`) {
+			digits << ch
+			if ch == `.` {
+				is_fl = true
+				dot_index = digits.len - 1
+			}
+			_ = s.read_byte()!
+			continue
+		}
+		break
+	}
+	if dot_index != -1 && digits[dot_index + 1..].len == 0 { // a `.` with no digits after it
+		return new_scan_error('invalid float', line, col)
+	}
+	if s.has_next_byte()! {
+		ch := s.ch
+		if ch == `e` || ch == `E` { // optional exponent part
+			digits << ch
+			_ = s.read_byte()!
+			if s.has_next_byte()! && s.ch in exp_signs { // optional sign of the exponent
+				digits << s.ch
+				_ = s.read_byte()!
+			}
+			mut exp_digits_count := 0
+			for {
+				if !s.has_next_byte()! {
+					break
+				}
+				digit := s.ch
+				if !digit.is_digit() {
+					break
+				}
+				digits << digit
+				exp_digits_count++
+				_ = s.read_byte()!
+			}
+			if exp_digits_count == 0 { // an exponent marker needs at least one digit
+				return new_scan_error('invalid exponent', line, col)
+			}
+		}
+	}
+	kind := if is_fl { TokenKind.float } else { TokenKind.int } // only a `.` makes the token a float; an exponent alone does not
+	return s.tokenize(digits, kind, line, col)
+}
+
+// next skips leading whitespace and returns the token that starts at the next byte.
+pub fn (mut s ReaderScanner) next() !Token {
+	s.skip_whitespace()!
+	start_line, start_col := s.line, s.col
+	if !s.has_next_byte()! {
+		return s.tokenize([]u8{}, .eof, start_line, start_col)
+	}
+	first := s.ch
+	if first in char_list {
+		_ = s.read_byte()!
+		return s.tokenize([]u8{}, unsafe { TokenKind(int(first)) }, start_line, start_col)
+	}
+	if first.is_digit() || first == `-` {
+		return s.num_scan(start_line, start_col)
+	}
+	match first {
+		`"` { return s.text_scan(start_line, start_col) }
+		`t` { return s.scan_ident('true', .bool, start_line, start_col) }
+		`f` { return s.scan_ident('false', .bool, start_line, start_col) }
+		`n` { return s.scan_ident('null', .null, start_line, start_col) }
+		else {}
+	}
+	return new_scan_error(invalid_token_description(first), start_line, start_col)
+}
diff --git a/vlib/x/json2/tests/scanner_test.v b/vlib/x/json2/tests/scanner_test.v
new file mode 100644
index 000000000..5e94446c8
--- /dev/null
+++ b/vlib/x/json2/tests/scanner_test.v
@@ -0,0 +1,102 @@
+import io
+import x.json2 as json
+
+struct ChunkedReader {
+	data       []u8 // bytes served by read()
+	chunk_size int // upper bound on the bytes returned by a single read() call
+mut:
+	pos int // offset of the next unread byte in data
+}
+
+fn (mut r ChunkedReader) read(mut buf []u8) !int {
+	remaining := r.data.len - r.pos
+	if remaining <= 0 {
+		return io.Eof{}
+	}
+	// serve at most chunk_size bytes, capped by what is left and by the room in buf
+	mut take := r.chunk_size
+	for limit in [remaining, buf.len] {
+		if take > limit {
+			take = limit
+		}
+	}
+	copied := copy(mut buf[..take], r.data[r.pos..r.pos + take])
+	r.pos += copied
+	return copied
+}
+
+fn test_public_scanner_iterates_tokens() {
+	mut scanner := json.new_scanner('{"items":[1,true,null,"x"]}')
+	mut got := []string{} // one "<kind>:<literal>" entry per token
+	for {
+		token := scanner.next()!
+		got << '${token.kind}:${token.literal()}'
+		if token.is_eof() { // the eof token is recorded before the loop stops
+			break
+		}
+	}
+	assert got == [
+		'lcbr:',
+		'str:items',
+		'colon:',
+		'lsbr:',
+		'int:1',
+		'comma:',
+		'bool:true',
+		'comma:',
+		'null:null',
+		'comma:',
+		'str:x',
+		'rsbr:',
+		'rcbr:',
+		'eof:',
+	]
+}
+
+fn test_reader_scanner_iterates_tokens() {
+	mut reader := ChunkedReader{
+		data:       '[10,true,"chunked",null]'.bytes()
+		chunk_size: 2 // at most two bytes per read, so longer tokens span several reads
+	}
+	mut scanner := json.new_reader_scanner(reader: reader, buffer_size: 3) // deliberately tiny buffer
+	defer {
+		scanner.free()
+	}
+	mut got := []string{} // one "<kind>:<literal>" entry per token
+	for {
+		token := scanner.next()!
+		got << '${token.kind}:${token.literal()}'
+		if token.is_eof() { // the eof token is recorded before the loop stops
+			break
+		}
+	}
+	assert got == [
+		'lsbr:',
+		'int:10',
+		'comma:',
+		'bool:true',
+		'comma:',
+		'str:chunked',
+		'comma:',
+		'null:null',
+		'rsbr:',
+		'eof:',
+	]
+}
+
+fn test_reader_scanner_reports_errors() {
+	mut reader := ChunkedReader{
+		data:       r'["\z"]'.bytes() // `\z` is not an escape the scanner accepts
+		chunk_size: 1
+	}
+	mut scanner := json.new_reader_scanner(reader: reader, buffer_size: 1)
+	defer {
+		scanner.free()
+	}
+	assert scanner.next()!.kind == .lsbr // the opening bracket still scans fine
+	scanner.next() or {
+		assert err.msg().contains('invalid backslash escape')
+		return
+	}
+	assert false // reached only when the second next() returned no error
+}
-- 
2.39.5