From 2f74099fd5025a7addcb910b2018d885d6720d9a Mon Sep 17 00:00:00 2001 From: JalonSolov Date: Fri, 5 Jun 2026 13:45:29 -0400 Subject: [PATCH] regex.pcre: small fixes (#27341) --- vlib/regex/pcre/README.md | 11 +++--- vlib/regex/pcre/regex.v | 49 +++++++++++++++++++++++- vlib/regex/pcre/regex_test.v | 72 ++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 7 deletions(-) diff --git a/vlib/regex/pcre/README.md b/vlib/regex/pcre/README.md index e388d1b74..4b04e7221 100644 --- a/vlib/regex/pcre/README.md +++ b/vlib/regex/pcre/README.md @@ -100,12 +100,13 @@ Supports backreferences like `$1`, `$2`. fn (r Regex) replace(text string, repl string) string ``` -### `change_stack_depth` -Updates the maximum backtracking depth for the VM. -Default is 1024. -Use this if your pattern is extremely complex and returns `none` prematurely. +### `max_stack_depth` (configuration field) +Controls the maximum backtracking depth for the VM. +Default is `2048`. Increase this value if complex patterns return `none` prematurely due to +deep backtracking; decrease it to limit memory usage. ```v ignore -fn (mut r Regex) change_stack_depth(depth int) +mut r := pcre.compile(pattern)!
+r.max_stack_depth = 4096 ``` --- diff --git a/vlib/regex/pcre/regex.v b/vlib/regex/pcre/regex.v index 0312e8502..56b2dd99f 100644 --- a/vlib/regex/pcre/regex.v +++ b/vlib/regex/pcre/regex.v @@ -57,6 +57,7 @@ Key Architectural Features and Optimizations: module pcre +import strconv import strings /****************************************************************************** @@ -664,6 +665,9 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta return error('Unclosed named group') } name := pattern[pos..end] + if name in group_map { + return error('Duplicate named group: ${name}') + } idx = group_counter group_map[name] = idx pos = end + 1 @@ -774,6 +778,38 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta typ: .uppercase_char } } + `x` { + // \xHH - two hex digits decode to a character + if pos + 2 > pattern.len { + return error('\\x requires exactly 2 hex digits') + } + hex_str := pattern[pos..pos + 2] + val := strconv.parse_uint(hex_str, 16, 32) or { + return error('Invalid hex escape \\x${hex_str}') + } + pos += 2 + parsed_nodes << Node{ + typ: .chr + chr: rune(val) + ignore_case: current_flags.ignore_case + } + } + `X` { + // \XHHHH - four hex digits decode to a Unicode codepoint + if pos + 4 > pattern.len { + return error('\\X requires exactly 4 hex digits') + } + hex_str := pattern[pos..pos + 4] + val := strconv.parse_uint(hex_str, 16, 32) or { + return error('Invalid hex escape \\X${hex_str}') + } + pos += 4 + parsed_nodes << Node{ + typ: .chr + chr: rune(val) + ignore_case: current_flags.ignore_case + } + } else { parsed_nodes << Node{ typ: .chr @@ -822,6 +858,9 @@ fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_sta } else { min } + if min < 0 || (max != -1 && max < min) { + return error('Invalid quantifier range {${min},${max}}') + } q = Quantifier{min, max, true} pos = end + 1 } @@ -1027,7 +1066,7 @@ fn (r &Regex) vm_match(text string, start_pos int, mut m 
Machine) ?Match { .split { if stack_ptr + frame_size >= stack_max { new_size := stack_max * 2 - if new_size > 1_000_000 { + if new_size > r.max_stack_depth { goto backtrack } m.stack.grow_len(new_size) @@ -1183,7 +1222,13 @@ pub fn (r &Regex) find_all(text string) []Match { } if res := r.vm_match(text, i, mut m) { matches << res - i = if res.end > i { res.end } else { i + 1 } + if res.end > i { + i = res.end + } else { + // Empty match: advance by one full rune to avoid infinite loop + _, rune_len := read_rune_at(text.str, text.len, i) + i += if rune_len > 0 { rune_len } else { 1 } + } } else { i++ } diff --git a/vlib/regex/pcre/regex_test.v b/vlib/regex/pcre/regex_test.v index 085052c9d..680ba7e33 100644 --- a/vlib/regex/pcre/regex_test.v +++ b/vlib/regex/pcre/regex_test.v @@ -635,3 +635,75 @@ fn test_compatibility_layer() { assert false, 'match_str should return none when no match is found' } } + +fn test_hex_escapes() { + // \xHH — two hex digits + tst_find(r'\x41', 'ABC', 'A') // 0x41 = 'A' + tst_find(r'\x61', 'abc', 'a') // 0x61 = 'a' + tst_find(r'\x41+', 'AAAB', 'AAA') + tst_find(r'\x20\x41', ' A test', ' A') // space + 'A' + + // \XHHHH — four hex digits (Unicode codepoint) + tst_find(r'\X0041', 'ABC', 'A') // U+0041 = 'A' + tst_find(r'\X0061', 'abc', 'a') // U+0061 = 'a' + tst_find(r'\X03B1', 'αβγ', 'α') // U+03B1 = 'α' + + // Mix with other escapes + tst_find(r'\x48\x65\x6C\x6C\x6F', 'Hello World', 'Hello') // \x48\x65\x6C\x6C\x6F = "Hello" + + // Invalid hex escape compile errors + tst_compile_error(r'\x4') // only 1 digit + tst_compile_error(r'\xGG') // invalid hex chars + tst_compile_error(r'\X004') // only 3 digits +} + +fn test_duplicate_named_groups() { + // Compile error: same name used twice + tst_compile_error(r'(?P<a>\d+)-(?P<a>\w+)') + // Different names are fine + r := pcre.compile(r'(?P<a>\d+)-(?P<b>\w+)') or { + assert false, 'Should compile: ${err}' + return + } + m := r.find('12-abc') or { + assert false, 'Should match' + return + } + assert
r.group_by_name(m, 'a') == '12' + assert r.group_by_name(m, 'b') == 'abc' +} + +fn test_invalid_quantifier_ranges() { + // min > max is an error + tst_compile_error(r'a{3,1}') + tst_compile_error(r'a{5,2}') + // negative min-like patterns (parsed as 0) + // {0,0} should compile and match empty string + r := pcre.compile(r'a{0,0}b') or { + assert false, 'Should compile: ${err}' + return + } + m := r.find('b') or { + assert false, 'Should match' + return + } + assert m.text == 'b' +} + +fn test_find_all_utf8_safety() { + // find_all with an empty-matching pattern must not get stuck inside a multi-byte rune + r := pcre.compile(r'x*') or { panic(err) } + matches := r.find_all('aé') // 'é' is 2 bytes (0xC3 0xA9) + // Every result start/end must align on a rune boundary + for m in matches { + text_bytes := 'aé'.bytes() + if m.start < text_bytes.len { + // byte at start must not be a UTF-8 continuation byte + assert (text_bytes[m.start] & 0xC0) != 0x80, 'Misaligned match start at ${m.start}' + } + } + // find_all should not infinite-loop on emoji + r2 := pcre.compile(r'y*') or { panic(err) } + matches2 := r2.find_all('😀!') + assert matches2.len > 0 +} -- 2.39.5