From bc333ab16016e72ee835d92e0e4d9b8c67693d98 Mon Sep 17 00:00:00 2001
From: Mike <45243121+tankf33der@users.noreply.github.com>
Date: Sun, 16 Nov 2025 20:52:24 +0200
Subject: [PATCH] encoding.utf8.validate: fix validation, add test (#25748)

---
 .../utf8/validate/encoding_utf8_test.v        |   1 +
 vlib/encoding/utf8/validate/validate_utf8.v   | 118 +++++-------------
 2 files changed, 31 insertions(+), 88 deletions(-)

diff --git a/vlib/encoding/utf8/validate/encoding_utf8_test.v b/vlib/encoding/utf8/validate/encoding_utf8_test.v
index aa43ea813..326e0a910 100644
--- a/vlib/encoding/utf8/validate/encoding_utf8_test.v
+++ b/vlib/encoding/utf8/validate/encoding_utf8_test.v
@@ -84,4 +84,5 @@ fn test_validate_invalid_str() {
 	assert validate.utf8_string('\xF1\xBF\xBF\xC0') == false
 	assert validate.utf8_string('\xF1\xBF\xC0\x80') == false
 	assert validate.utf8_string('\xF1\xC0\x80\x80') == false
+	assert validate.utf8_string('\xED\xEF\xBF\x89') == false
 }
diff --git a/vlib/encoding/utf8/validate/validate_utf8.v b/vlib/encoding/utf8/validate/validate_utf8.v
index 428c0e065..b08bc2df6 100644
--- a/vlib/encoding/utf8/validate/validate_utf8.v
+++ b/vlib/encoding/utf8/validate/validate_utf8.v
@@ -1,11 +1,28 @@
 module validate
 
-struct Utf8State {
-mut:
-	index    int
-	subindex int
-	failed   bool
-}
+// utf8d is the lookup table of Bjoern Hoehrmann's UTF-8 decoder DFA, see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+// Entries 0..255 map an input byte to its character class; entries from 256 on are the state transitions (indexed by state + class).
+// vfmt off
+const utf8d = [
+	u8(0), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+	7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 3, 3, 3, 3,
+	3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8,
+	8, 8, 8, 8, 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12,
+	0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12,
+	12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
+	24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12,
+	12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 12, 12,
+	12, 12, 12 ]!
+// vfmt on
 
 // utf8_string returns true, if the given string `s` consists only of valid UTF-8 runes
 pub fn utf8_string(s string) bool {
@@ -13,91 +30,16 @@ pub fn utf8_string(s string) bool {
 }
 
 // utf8_data returns true, if the given `data` block, with length `len` bytes, consists only of valid UTF-8 runes
+@[direct_array_access]
 pub fn utf8_data(data &u8, len int) bool {
-	mut state := Utf8State{}
+	mut state := 0
+	// DFA state: 0 means every rune seen so far is complete, 12 means the input is already invalid.
 	for i := 0; i < len; i++ {
-		s := unsafe { data[i] }
-		if s == 0 {
-			break
-		}
-		state.next_state(s)
-		if state.failed {
+		b := unsafe { data[i] }
+		state = utf8d[256 + state + utf8d[b]]
+		if state == 12 {
 			return false
 		}
 	}
-	return !state.failed && state.subindex <= 0
-}
-
-fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
-	if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) {
-		if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) {
-			s.subindex++
-			return true
-		}
-	} else {
-		s.failed = true
-		if is_tail {
-			s.index = 0
-			s.subindex = 0
-			s.failed = false
-		}
-		return true
-	}
-	s.index++
-	s.subindex = 0
-	return false
-}
-
-/* Check UTF-8 Byte sequences according to Unicode Standard
- * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/
- * Code Points        1st       2s       3s       4s
- * U+0000..U+007F     00..7F
- * U+0080..U+07FF     C2..DF   80..BF
- * U+0800..U+0FFF     E0       A0..BF   80..BF
- * U+1000..U+CFFF     E1..EC   80..BF   80..BF
- * U+D000..U+D7FF     ED       80..9F   80..BF
- * U+E000..U+FFFF     EE..EF   80..BF   80..BF
- * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
- * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
- * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
- */
-fn (mut s Utf8State) next_state(c u8) {
-	// sequence 1
-	if s.index == 0 {
-		if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 {
-			return
-		}
-		s.index++
-		s.subindex = 0
-	}
-	is_tail := c >= 0x80 && c <= 0xBF
-	// sequence 2
-	if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) {
-		return
-	}
-	// sequence 3
-	if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) {
-		return
-	}
-	if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) {
-		return
-	}
-	// sequence 4
-	if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) {
-		return
-	}
-	// we should never reach here
-	s.failed = true
+	return state == 0
 }
-- 
2.39.5