// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module json2

import io
import strconv

// JsonScanError describes a tokenization error reported by the iterative scanner APIs.
// Its message is rendered by `msg()` as `line:character: Invalid json token: message`.
pub struct JsonScanError {
	Error
pub:
	message string // description of what was wrong with the input

	line      int // line recorded for the error
	character int // column recorded for the error
}

// msg renders the error as `<line>:<character>: Invalid json token: <message>`.
fn (e JsonScanError) msg() string {
	position := '${e.line}:${e.character}'
	return '${position}: Invalid json token: ${e.message}'
}

// Scanner tokenizes JSON from an in-memory string or byte slice.
// Create one with `new_scanner` or `new_scanner_from_bytes` and call `next` to get tokens.
pub struct Scanner {
mut:
	text []u8 // the complete JSON input
	pos  int // index into `text` of the byte the scanner is currently at
	line int = 1 // current line, starts at 1
	col  int = 1 // current column, starts at 1
}

// ReaderScanner tokenizes JSON incrementally from any io.Reader.
// Bytes are pulled one at a time through a buffered reader; at most one byte
// is held back as lookahead (`peeked`/`ch`).
pub struct ReaderScanner {
mut:
	reader &io.BufferedReader // buffered wrapper around the configured reader
	peeked bool // true while `ch` holds a byte that was read but not yet consumed
	ch     u8 // the lookahead byte, only meaningful while `peeked` is true
	line   int = 1 // line of the next byte to consume, starts at 1
	col    int = 1 // column of the next byte to consume, starts at 1
}

// ReaderScannerConfig configures a reader-backed JSON scanner.
// It is the parameter struct accepted by `new_reader_scanner`.
@[params]
pub struct ReaderScannerConfig {
pub:
	reader      io.Reader // source of the JSON bytes
	buffer_size int = 128 * 1024 // passed as `cap` to the buffered reader; defaults to 128 KiB
}

// TokenKind identifies the kind of a JSON token.
// The punctuation kinds are given the numeric value of their ASCII byte,
// which lets the scanners convert such a byte directly into its TokenKind.
pub enum TokenKind {
	none
	error // scanning failed; the token literal holds the error description
	str // string literal
	float // number containing a decimal point
	int // number without a decimal point
	null // the `null` keyword
	bool // the `true` or `false` keyword
	eof // end of the input
	comma = 44  // ,
	colon = 58  // :
	lsbr  = 91  // [
	rsbr  = 93  // ]
	lcbr  = 123 // {
	rcbr  = 125 // }
}

// new_scanner creates an iterative scanner for an in-memory JSON string.
// The string is converted to bytes and scanning starts at line 1, column 1.
pub fn new_scanner(text string) Scanner {
	return new_scanner_from_bytes(text.bytes())
}

// new_scanner_from_bytes creates an iterative scanner for an in-memory JSON byte slice.
// Scanning starts at the first byte, at line 1, column 1.
pub fn new_scanner_from_bytes(text []u8) Scanner {
	scanner := Scanner{
		line: 1
		col:  1
		text: text
	}
	return scanner
}

// new_reader_scanner creates an iterative scanner that reads JSON tokens from an io.Reader.
// The configured reader is wrapped in a buffered reader whose capacity is `config.buffer_size`.
pub fn new_reader_scanner(config ReaderScannerConfig) &ReaderScanner {
	buffered := io.new_buffered_reader(reader: config.reader, cap: config.buffer_size)
	return &ReaderScanner{
		reader: buffered
		line:   1
		col:    1
	}
}

// free releases the reader scanner's internal buffer by calling `free` on the
// underlying buffered reader.
pub fn (mut s ReaderScanner) free() {
	s.reader.free()
}

// Token is a single lexical unit produced by the JSON scanners.
pub struct Token {
pub:
	lit  []u8      // literal representation of the token
	kind TokenKind // the token number/enum; for quick comparisons
	line int       // the line in the source where the token occurred
	col  int       // the column in the source where the token occurred
}

// literal returns the token contents (`lit`) converted to a string.
pub fn (t Token) literal() string {
	return t.lit.bytestr()
}

// full_col returns the token column plus the length of its literal.
pub fn (t Token) full_col() int {
	literal_len := t.lit.len
	return t.col + literal_len
}

// is_eof reports whether the token marks the end of the JSON stream,
// i.e. whether its kind is `.eof`.
pub fn (t Token) is_eof() bool {
	return t.kind == .eof
}

// structural characters of JSON; each one is emitted as a token of its own.
const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]!
// carriage return, line feed and tab: the whitespace bytes (besides the space
// character) that the scanners check for when moving to a new position.
// NOTE: despite the name, this list also contains the tab character.
const newlines = [`\r`, `\n`, `\t`]!
// list of characters that need to be escaped inside a JSON string.
// double quotes and forward slashes are excluded intentionally since
// they have their own separate checks for it in order to pass the
// JSON test suite (https://github.com/nst/JSONTestSuite/).
const important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]!
// characters that may directly follow a backslash inside a JSON string,
// aside from `u`, which introduces a \u{4-hex digits} escape.
const valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]!
// maps the byte value of an escape character to the character it stands for
// (eg. 110, which is `n`, => \n).
const unicode_transform_escapes = {
	98:  `\b`
	102: `\f`
	110: `\n`
	114: `\r`
	116: `\t`
	92:  `\\`
	34:  `"`
	47:  `/`
}
// signs that are accepted directly after the `e`/`E` of a number exponent.
const exp_signs = [u8(`-`), `+`]!

// new_scan_error builds a JsonScanError from a message and a line/column position.
// `col` is stored in the error's `character` field.
fn new_scan_error(message string, line int, col int) JsonScanError {
	scan_error := JsonScanError{
		line:      line
		character: col
		message:   message
	}
	return scan_error
}

// token_to_scan_error converts a token into a JsonScanError, using the token
// literal as the message and the token position as the error position.
fn token_to_scan_error(token Token) JsonScanError {
	return JsonScanError{
		message:   token.literal()
		line:      token.line
		character: token.col
	}
}

// important_escapable_char returns the letter used in the backslash escape of
// the given control character (`\n` => `n`), or `none` when `ch` is not one of
// backspace, form feed, line feed, carriage return or tab.
fn important_escapable_char(ch u8) ?u8 {
	if ch == `\b` {
		return u8(`b`)
	}
	if ch == `\f` {
		return u8(`f`)
	}
	if ch == `\n` {
		return u8(`n`)
	}
	if ch == `\r` {
		return u8(`r`)
	}
	if ch == `\t` {
		return u8(`t`)
	}
	return none
}

// invalid_token_description returns the `invalid token` message for a byte.
// Bytes in the range 32..126 are shown as-is, all other bytes are shown in
// their escaped form.
fn invalid_token_description(ch u8) string {
	is_printable := ch >= 32 && ch <= 126
	shown := if is_printable { ch.ascii_str() } else { ch.str_escaped() }
	return 'invalid token `${shown}`'
}

// move proceeds to the next position, skipping both spaces and newlines
// found there (see move_pos).
fn (mut s Scanner) move() {
	s.move_pos(true, true)
}

// move_pos_with_newlines is the same as move_pos but only enables newline checking;
// a space at the new position is left in place.
fn (mut s Scanner) move_pos_with_newlines() {
	s.move_pos(false, true)
}

// move_pos advances the scanner by one byte and then optionally skips
// whitespace found at the new position.
// - include_newlines: when the new byte is in `newlines`, increment `line`,
//   reset `col` to 0 and keep moving while the current byte is in `newlines`.
// - include_space: when the new byte is a space, step over it and over the
//   spaces that follow.
// When the new position is past the end of the text only `col` is incremented.
// Note that landing on any other byte leaves `col` unchanged.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos < s.text.len {
		if include_newlines && s.text[s.pos] in newlines {
			s.line++
			s.col = 0
			// for a `\r\n` pair, step onto the `\n` so the pair bumps `line` once here
			if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
				s.pos++
			}
			// `move` re-enters move_pos with both flags enabled, so spaces
			// following the newline are skipped as well
			for s.pos < s.text.len && s.text[s.pos] in newlines {
				s.move()
			}
		} else if include_space && s.text[s.pos] == ` ` {
			// step over the space that was just reached, then over the rest of the run
			s.pos++
			s.col++
			for s.pos < s.text.len && s.text[s.pos] == ` ` {
				s.move()
			}
		}
	} else {
		s.col++
	}
}

// error returns an error token whose literal is the given description and
// whose position is the scanner's current line and column.
fn (s &Scanner) error(description string) Token {
	return Token{
		lit:  description.bytes()
		kind: .error
		line: s.line
		col:  s.col
	}
}

// tokenize returns a token with the given lit and kind, positioned at the
// scanner's current line and column.
fn (s &Scanner) tokenize(lit []u8, kind TokenKind) Token {
	line, col := s.line, s.col
	return Token{
		lit:  lit
		kind: kind
		line: line
		col:  col
	}
}

// text_scan scans and returns a string token.
// It is entered with `s.pos` on the opening double quote and collects the
// decoded bytes of the string: backslash escapes are replaced by the bytes
// they stand for and `\uXXXX` escapes are converted with `utf32_to_str`.
// An error token is returned for unescaped control characters, malformed
// escapes and a missing closing quote.
@[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []u8{}
	for {
		// the first iteration steps over the opening quote
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			break
		}
		ch := s.text[s.pos]
		if ch == `"` {
			has_closed = true
			break
		} else if escaped := important_escapable_char(ch) {
			return s.error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}')
		} else if ch < 0x20 {
			return s.error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}')
		} else if ch == `\\` {
			if s.pos == s.text.len - 1 {
				return s.error('incomplete backslash escape at end of JSON input')
			}

			peek := s.text[s.pos + 1]
			if peek in valid_unicode_escapes {
				// single character escape: store the character it stands for
				chrs << unicode_transform_escapes[int(peek)]
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				// `\uXXXX` needs 5 more bytes after the backslash
				if s.pos + 5 < s.text.len {
					s.pos++
					s.col++
					mut codepoint := []u8{}
					codepoint_start := s.pos
					// read up to 4 hex digits following the `u`
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`${x}` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					// NOTE(review): every escape is converted on its own, so a surrogate
					// pair is not combined here - confirm whether callers handle that.
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					// the function is marked manualfree, so temporaries are released by hand
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == u8(229) {
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	// the token is created before moving, so it carries the position reached
	// at the end of the string
	tok := s.tokenize(chrs, .str)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}

// num_scan_step advances one byte inside a number literal and reports whether
// the scanner landed on the directly following byte. It returns false when
// move_pos_with_newlines skipped over newline/tab bytes, which means the
// number literal has ended.
fn (mut s Scanner) num_scan_step() bool {
	before := s.pos
	s.move_pos_with_newlines()
	return s.pos == before + 1
}

// num_scan scans and returns an int/float token.
// It is entered with `s.pos` on a digit or on `-`. The token literal holds the
// number text; the kind is `.float` when the number contains a decimal point
// and `.int` otherwise. An error token is returned for a `-` that is not
// followed by a digit, for leading zeroes, for a decimal point without
// fraction digits and for an exponent without digits.
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1
	mut digits := []u8{}
	// becomes false as soon as whitespace was skipped while advancing, so that
	// bytes after a newline or tab are never glued onto this number
	mut contiguous := true
	if s.text[s.pos] == `-` {
		digits << `-`
		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		// the next byte is a digit, so this never skips whitespace
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	for contiguous && s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		contiguous = s.num_scan_step()
	}
	// a decimal point must be followed by at least one digit; `dot_index` is an
	// index into `digits`, so it is compared against `digits.len` only
	if dot_index != -1 && dot_index == digits.len - 1 {
		return s.error('invalid float')
	}
	if contiguous && s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		contiguous = s.num_scan_step()
		if contiguous && s.pos < s.text.len && s.text[s.pos] in exp_signs {
			digits << s.text[s.pos]
			contiguous = s.num_scan_step()
		}
		mut exp_digits_count := 0
		for contiguous && s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			contiguous = s.num_scan_step()
		}
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int }
	return s.tokenize(digits, kind)
}

// invalid_token returns an error token with the invalid token message for
// the byte at the scanner's current position.
fn (s &Scanner) invalid_token() Token {
	return s.error(invalid_token_description(s.text[s.pos]))
}

// next returns the next JSON token from the in-memory scanner.
// A scanned error token is converted into a JsonScanError and returned as an error.
pub fn (mut s Scanner) next() !Token {
	scanned := s.scan()
	if scanned.kind != .error {
		return scanned
	}
	return token_to_scan_error(scanned)
}

// scan returns a token based on the scanner's current position.
// used to set the next token
// Leading whitespace is skipped first. At the end of the text an `.eof` token
// is returned; invalid input produces an `.error` token (see `error`).
@[manualfree]
fn (mut s Scanner) scan() Token {
	// `move` skips a run of spaces or a run of newlines/tabs, but it may stop on
	// whitespace of the other kind (e.g. spaces, a newline, spaces, a newline),
	// so keep moving until a non-whitespace byte or the end of the text is reached.
	for s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]u8{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		// candidate for the 4 byte keywords `true` / `null`
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool
			}
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 4]
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		// candidate for the 5 byte keyword `false`
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in char_list {
		// the punctuation kinds of TokenKind have the value of their byte
		chr := s.text[s.pos]
		tok := s.tokenize([]u8{}, unsafe { TokenKind(int(chr)) })
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		return s.invalid_token()
	}
}

// tokenize returns a token with the given lit and kind at the given line and column.
fn (mut s ReaderScanner) tokenize(lit []u8, kind TokenKind, line int, col int) Token {
	tok := Token{
		kind: kind
		lit:  lit
		line: line
		col:  col
	}
	return tok
}

// has_next_byte reports whether another byte is available.
// When no byte is currently held as lookahead, one byte is read from the
// reader and stored in `s.ch`. An io.Eof error or a read of zero bytes yields
// false; any other read error is returned.
fn (mut s ReaderScanner) has_next_byte() !bool {
	if !s.peeked {
		mut one := [u8(0)]
		count := s.reader.read(mut one) or {
			if err is io.Eof {
				return false
			}
			return err
		}
		if count == 0 {
			return false
		}
		s.ch = one[0]
		s.peeked = true
	}
	return true
}

// peek_byte returns the next byte without consuming it.
// It returns io.Eof when no more bytes are available.
fn (mut s ReaderScanner) peek_byte() !u8 {
	if s.has_next_byte()! {
		return s.ch
	}
	return io.Eof{}
}

// advance_position updates `line` and `col` after the byte `ch` was consumed.
// A line feed or carriage return starts a new line (column 1); every other
// byte, including a tab, advances the column by one. For a carriage return
// the following line feed, if any, is consumed as well so that `\r\n` counts
// as a single line break. Errors from reading that lookahead byte are returned.
fn (mut s ReaderScanner) advance_position(ch u8) ! {
	if ch == `\r` {
		// swallow the `\n` of a `\r\n` pair without counting another line
		if s.has_next_byte()! && s.ch == `\n` {
			s.peeked = false
		}
	}
	// only real line terminators start a new line; `newlines` also contains
	// the tab character and therefore must not be used here
	if ch == `\n` || ch == `\r` {
		s.line++
		s.col = 1
		return
	}
	s.col++
}

// read_byte consumes and returns the next byte and updates the position.
// It returns io.Eof when no more bytes are available.
fn (mut s ReaderScanner) read_byte() !u8 {
	if !s.has_next_byte()! {
		return io.Eof{}
	}
	consumed := s.ch
	s.peeked = false
	s.advance_position(consumed)!
	return consumed
}

// skip_whitespace consumes bytes while the next byte is a space or is in
// `newlines`. It stops at the first other byte or at the end of the input.
fn (mut s ReaderScanner) skip_whitespace() ! {
	for {
		if !s.has_next_byte()! {
			break
		}
		if s.ch != ` ` && s.ch !in newlines {
			break
		}
		_ = s.read_byte()!
	}
}

// scan_ident consumes the keyword `ident` byte by byte and returns a token of
// the given kind positioned at `line`/`col`. A scan error is returned when the
// input ends early or when a byte differs from the expected one; the error is
// positioned at the byte where the mismatch occurred.
fn (mut s ReaderScanner) scan_ident(ident string, kind TokenKind, line int, col int) !Token {
	mut matched := []u8{}
	for i in 0 .. ident.len {
		at_line, at_col := s.line, s.col
		got := s.read_byte() or {
			if err is io.Eof {
				return new_scan_error('unexpected end of JSON input', at_line, at_col)
			}
			return err
		}
		if got != ident[i] {
			return new_scan_error(invalid_token_description(got), at_line, at_col)
		}
		matched << got
	}
	return s.tokenize(matched, kind, line, col)
}

// text_scan scans a string token whose opening quote is the next byte.
// `line`/`col` are the position of the token start and are used for the
// returned token and for the missing closing quote error. Backslash escapes
// are replaced by the bytes they stand for and `\uXXXX` escapes are converted
// with `utf32_to_str`. Unescaped control characters and malformed escapes are
// reported as scan errors at the position where they were found.
@[manualfree]
fn (mut s ReaderScanner) text_scan(line int, col int) !Token {
	mut chrs := []u8{}
	_ = s.read_byte()! // opening quote
	for {
		// position of the byte that is about to be inspected
		current_line, current_col := s.line, s.col
		if !s.has_next_byte()! {
			return new_scan_error('missing double quotes in string closing', line, col)
		}
		ch := s.ch
		if ch == `"` {
			_ = s.read_byte()!
			break
		} else if escaped := important_escapable_char(ch) {
			return new_scan_error('character must be escaped with a backslash, replace with: \\${escaped.ascii_str()}',
				current_line, current_col)
		} else if ch < 0x20 {
			return new_scan_error('character must be escaped with a unicode escape, replace with: \\u${ch:04x}',
				current_line, current_col)
		} else if ch == `\\` {
			// consume the backslash; escape errors point at the byte after it
			_ = s.read_byte()!
			escape_line, escape_col := s.line, s.col
			if !s.has_next_byte()! {
				return new_scan_error('incomplete backslash escape at end of JSON input',
					escape_line, escape_col)
			}
			peek := s.ch
			if peek in valid_unicode_escapes {
				// single character escape: store the character it stands for
				chrs << unicode_transform_escapes[int(peek)]
				_ = s.read_byte()!
				continue
			} else if peek == `u` {
				_ = s.read_byte()!
				mut codepoint := []u8{}
				// exactly 4 hex digits must follow the `u`
				for _ in 0 .. 4 {
					digit_line, digit_col := s.line, s.col
					if !s.has_next_byte()! {
						return new_scan_error('incomplete unicode escape', escape_line, escape_col)
					}
					digit := s.ch
					if digit == `"` {
						return new_scan_error('unicode escape must have 4 hex digits', digit_line,
							digit_col)
					} else if !digit.is_hex_digit() {
						x := digit.ascii_str()
						return new_scan_error('`${x}` is not a hex digit', digit_line, digit_col)
					}
					codepoint << digit
					_ = s.read_byte()!
				}
				// NOTE(review): every escape is converted on its own, so a surrogate
				// pair is not combined here - confirm whether callers handle that.
				val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
				converted := utf32_to_str(val)
				converted_bytes := converted.bytes()
				chrs << converted_bytes
				// the function is marked manualfree, so temporaries are released by hand
				unsafe {
					converted.free()
					converted_bytes.free()
					codepoint.free()
				}
				continue
			} else if peek == `U` {
				return new_scan_error('unicode endpoints must be in lowercase `u`', escape_line,
					escape_col)
			} else if peek == u8(229) {
				return new_scan_error('unicode endpoint not allowed', escape_line, escape_col)
			} else {
				return new_scan_error('invalid backslash escape', escape_line, escape_col)
			}
		}
		// ordinary byte: keep it and consume it
		chrs << ch
		_ = s.read_byte()!
	}
	return s.tokenize(chrs, .str, line, col)
}

// num_scan scans an int/float token that starts at the next byte.
// `line`/`col` are the position of the token start and are used for the
// returned token and for most errors. The token kind is `.float` when the
// number contains a decimal point and `.int` otherwise. Scan errors are
// returned for a `-` that is not followed by a digit, for leading zeroes, for
// a decimal point without fraction digits and for an exponent without digits.
fn (mut s ReaderScanner) num_scan(line int, col int) !Token {
	mut is_fl := false
	mut dot_index := -1
	mut digits := []u8{}
	// optional minus sign, which must be followed by a digit
	if s.peek_byte()! == `-` {
		digits << `-`
		_ = s.read_byte()!
		if !s.has_next_byte()! {
			return new_scan_error('invalid token `-`', line, col)
		}
		next := s.ch
		if !next.is_digit() {
			// reported at the position of the offending byte
			return new_scan_error(invalid_token_description(next), s.line, s.col)
		}
	}
	// a leading `0` must not be followed by another digit
	if s.has_next_byte()! {
		first := s.ch
		if first == `0` {
			digits << first
			_ = s.read_byte()!
			if s.has_next_byte()! && s.ch.is_digit() {
				return new_scan_error('leading zeroes in a number are not allowed', line, col)
			}
		}
	}
	// integer digits and at most one decimal point followed by fraction digits
	for {
		if !s.has_next_byte()! {
			break
		}
		ch := s.ch
		if ch.is_digit() || (!is_fl && ch == `.`) {
			digits << ch
			if ch == `.` {
				is_fl = true
				dot_index = digits.len - 1
			}
			_ = s.read_byte()!
			continue
		}
		break
	}
	// the decimal point was the last byte collected, so no fraction digit followed it
	if dot_index != -1 && digits[dot_index + 1..].len == 0 {
		return new_scan_error('invalid float', line, col)
	}
	// optional exponent: `e`/`E`, optional sign, at least one digit
	if s.has_next_byte()! {
		ch := s.ch
		if ch == `e` || ch == `E` {
			digits << ch
			_ = s.read_byte()!
			if s.has_next_byte()! && s.ch in exp_signs {
				digits << s.ch
				_ = s.read_byte()!
			}
			mut exp_digits_count := 0
			for {
				if !s.has_next_byte()! {
					break
				}
				digit := s.ch
				if !digit.is_digit() {
					break
				}
				digits << digit
				exp_digits_count++
				_ = s.read_byte()!
			}
			if exp_digits_count == 0 {
				return new_scan_error('invalid exponent', line, col)
			}
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int }
	return s.tokenize(digits, kind, line, col)
}

// next returns the next JSON token from the reader-backed scanner.
// Leading whitespace is skipped first. At the end of the input an `.eof`
// token is returned; a byte that cannot start a token yields a scan error.
pub fn (mut s ReaderScanner) next() !Token {
	s.skip_whitespace()!
	start_line, start_col := s.line, s.col
	if !s.has_next_byte()! {
		return s.tokenize([]u8{}, .eof, start_line, start_col)
	}
	first := s.ch
	// dispatch on the first byte of the token
	match first {
		`t` {
			return s.scan_ident('true', .bool, start_line, start_col)
		}
		`n` {
			return s.scan_ident('null', .null, start_line, start_col)
		}
		`f` {
			return s.scan_ident('false', .bool, start_line, start_col)
		}
		`"` {
			return s.text_scan(start_line, start_col)
		}
		else {}
	}
	if first in char_list {
		_ = s.read_byte()!
		return s.tokenize([]u8{}, unsafe { TokenKind(int(first)) }, start_line, start_col)
	}
	if first.is_digit() || first == `-` {
		return s.num_scan(start_line, start_col)
	}
	return new_scan_error(invalid_token_description(first), start_line, start_col)
}