From 2f74099fd5025a7addcb910b2018d885d6720d9a Mon Sep 17 00:00:00 2001
From: JalonSolov <JalonSolov@gmail.com>
Date: Fri, 5 Jun 2026 13:45:29 -0400
Subject: [PATCH] regex.pcre: small fixes (#27341)

---
 vlib/regex/pcre/README.md    | 11 +++---
 vlib/regex/pcre/regex.v      | 49 +++++++++++++++++++++++-
 vlib/regex/pcre/regex_test.v | 72 ++++++++++++++++++++++++++++++++++++
 3 files changed, 125 insertions(+), 7 deletions(-)

diff --git a/vlib/regex/pcre/README.md b/vlib/regex/pcre/README.md
index e388d1b74..4b04e7221 100644
--- a/vlib/regex/pcre/README.md
+++ b/vlib/regex/pcre/README.md
@@ -100,12 +100,13 @@ Supports backreferences like `$1`, `$2`.
 fn (r Regex) replace(text string, repl string) string
 ```
 
-### `change_stack_depth`
-Updates the maximum backtracking depth for the VM.
-Default is 1024.
-Use this if your pattern is extremely complex and returns `none` prematurely.
+### `max_stack_depth` (configuration field)
+Controls the maximum backtracking depth for the VM.
+Default is `2048`. Increase this value if complex patterns return `none` prematurely due to
+deep backtracking; decrease it to limit memory usage.
 ```v ignore
-fn (mut r Regex) change_stack_depth(depth int)
+r := pcre.compile(pattern)!
+r.max_stack_depth = 4096
 ```
 
 ---
diff --git a/vlib/regex/pcre/regex.v b/vlib/regex/pcre/regex.v
index 0312e8502..56b2dd99f 100644
--- a/vlib/regex/pcre/regex.v
+++ b/vlib/regex/pcre/regex.v
@@ -57,6 +57,7 @@ Key Architectural Features and Optimizations:
 
 module pcre
 
+import strconv
 import strings
 
 /******************************************************************************
@@ -664,6 +665,9 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
 							return error('Unclosed named group')
 						}
 						name := pattern[pos..end]
+						if name in group_map {
+							return error('Duplicate named group: ${name}')
+						}
 						idx = group_counter
 						group_map[name] = idx
 						pos = end + 1
@@ -774,6 +778,38 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
 							typ: .uppercase_char
 						}
 					}
+					`x` {
+						// \xHH - two hex digits decode to a character
+						if pos + 2 > pattern.len {
+							return error('\\x requires exactly 2 hex digits')
+						}
+						hex_str := pattern[pos..pos + 2]
+						val := strconv.parse_uint(hex_str, 16, 32) or {
+							return error('Invalid hex escape \\x${hex_str}')
+						}
+						pos += 2
+						parsed_nodes << Node{
+							typ:         .chr
+							chr:         rune(val)
+							ignore_case: current_flags.ignore_case
+						}
+					}
+					`X` {
+						// \XHHHH - four hex digits decode to a Unicode codepoint
+						if pos + 4 > pattern.len {
+							return error('\\X requires exactly 4 hex digits')
+						}
+						hex_str := pattern[pos..pos + 4]
+						val := strconv.parse_uint(hex_str, 16, 32) or {
+							return error('Invalid hex escape \\X${hex_str}')
+						}
+						pos += 4
+						parsed_nodes << Node{
+							typ:         .chr
+							chr:         rune(val)
+							ignore_case: current_flags.ignore_case
+						}
+					}
 					else {
 						parsed_nodes << Node{
 							typ:         .chr
@@ -822,6 +858,9 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta
 						} else {
 							min
 						}
+						if min < 0 || (max != -1 && max < min) {
+							return error('Invalid quantifier range {${min},${max}}')
+						}
 						q = Quantifier{min, max, true}
 						pos = end + 1
 					}
@@ -1027,7 +1066,7 @@ fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match {
 				.split {
 					if stack_ptr + frame_size >= stack_max {
 						new_size := stack_max * 2
-						if new_size > 1_000_000 {
+						if new_size > r.max_stack_depth {
 							goto backtrack
 						}
 						m.stack.grow_len(new_size)
@@ -1183,7 +1222,13 @@ pub fn (r &Regex) find_all(text string) []Match {
 		}
 		if res := r.vm_match(text, i, mut m) {
 			matches << res
-			i = if res.end > i { res.end } else { i + 1 }
+			if res.end > i {
+				i = res.end
+			} else {
+				// Empty match: advance by one full rune to avoid infinite loop
+				_, rune_len := read_rune_at(text.str, text.len, i)
+				i += if rune_len > 0 { rune_len } else { 1 }
+			}
 		} else {
 			i++
 		}
diff --git a/vlib/regex/pcre/regex_test.v b/vlib/regex/pcre/regex_test.v
index 085052c9d..680ba7e33 100644
--- a/vlib/regex/pcre/regex_test.v
+++ b/vlib/regex/pcre/regex_test.v
@@ -635,3 +635,75 @@ fn test_compatibility_layer() {
 		assert false, 'match_str should return none when no match is found'
 	}
 }
+
+fn test_hex_escapes() {
+	// \xHH — two hex digits
+	tst_find(r'\x41', 'ABC', 'A') // 0x41 = 'A'
+	tst_find(r'\x61', 'abc', 'a') // 0x61 = 'a'
+	tst_find(r'\x41+', 'AAAB', 'AAA')
+	tst_find(r'\x20\x41', ' A test', ' A') // space + 'A'
+
+	// \XHHHH — four hex digits (Unicode codepoint)
+	tst_find(r'\X0041', 'ABC', 'A') // U+0041 = 'A'
+	tst_find(r'\X0061', 'abc', 'a') // U+0061 = 'a'
+	tst_find(r'\X03B1', 'αβγ', 'α') // U+03B1 = 'α'
+
+	// Mix with other escapes
+	tst_find(r'\x48\x65\x6C\x6C\x6F', 'Hello World', 'Hello') // \x48\x65\x6C\x6C\x6F = "Hello"
+
+	// Invalid hex escape compile errors
+	tst_compile_error(r'\x4') // only 1 digit
+	tst_compile_error(r'\xGG') // invalid hex chars
+	tst_compile_error(r'\X004') // only 3 digits
+}
+
+fn test_duplicate_named_groups() {
+	// Compile error: same name used twice
+	tst_compile_error(r'(?P<id>\d+)-(?P<id>\w+)')
+	// Different names are fine
+	r := pcre.compile(r'(?P<a>\d+)-(?P<b>\w+)') or {
+		assert false, 'Should compile: ${err}'
+		return
+	}
+	m := r.find('12-abc') or {
+		assert false, 'Should match'
+		return
+	}
+	assert r.group_by_name(m, 'a') == '12'
+	assert r.group_by_name(m, 'b') == 'abc'
+}
+
+fn test_invalid_quantifier_ranges() {
+	// min > max is an error
+	tst_compile_error(r'a{3,1}')
+	tst_compile_error(r'a{5,2}')
+	// negative min-like patterns (parsed as 0)
+	// {0,0} should compile and match empty string
+	r := pcre.compile(r'a{0,0}b') or {
+		assert false, 'Should compile: ${err}'
+		return
+	}
+	m := r.find('b') or {
+		assert false, 'Should match'
+		return
+	}
+	assert m.text == 'b'
+}
+
+fn test_find_all_utf8_safety() {
+	// find_all with an empty-matching pattern must not get stuck inside a multi-byte rune
+	r := pcre.compile(r'x*') or { panic(err) }
+	matches := r.find_all('aé') // 'é' is 2 bytes (0xC3 0xA9)
+	// Every result start/end must align on a rune boundary
+	for m in matches {
+		text_bytes := 'aé'.bytes()
+		if m.start < text_bytes.len {
+			// byte at start must not be a UTF-8 continuation byte
+			assert (text_bytes[m.start] & 0xC0) != 0x80, 'Misaligned match start at ${m.start}'
+		}
+	}
+	// find_all should not infinite-loop on emoji
+	r2 := pcre.compile(r'y*') or { panic(err) }
+	matches2 := r2.find_all('😀!')
+	assert matches2.len > 0
+}
-- 
2.39.5