v / vlib / regex / pcre / regex_test.v
709 lines · 576 sloc · 21.18 KB · 2f74099fd5025a7addcb910b2018d885d6720d9a
Raw
1/*
2regex_test.v
3
4Copyright (c) 2026 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7*/
8import regex.pcre
9
// main runs every test function defined in this file, in sequence, and prints
// a closing banner once all of them have returned.
// NOTE(review): functions named `test_*` are presumably also discovered by the
// `v test` runner on their own - confirm whether `main` is used in that mode.
fn main() {
	println('Running pcre tests...\n')

	test_regex()
	test_complex_quantifiers()
	test_range_quantifiers()
	test_anchors()
	test_word_boundaries()
	test_flags()
	test_named_groups()
	test_non_capturing_groups()

	// New features tests
	test_find_all()
	test_find_from()
	test_replace()
	// test_stress_vm()

	// Tests defined further down in this file. They were previously never
	// called from here, so the banner below claimed more than was checked.
	test_non_greedy_quantifiers()
	test_compatibility_layer()
	test_hex_escapes()
	test_duplicate_named_groups()
	test_invalid_quantifier_ranges()
	test_find_all_utf8_safety()

	println('\nAll tests passed!')
}
30
31// --- New Feature Tests ---
32
// test_find_all feeds several pattern/text pairs to tst_find_all and checks
// the list of matched texts that comes back, in order.
fn test_find_all() {
	println('\n--- Testing find_all() ---')

	digits := r'\d+'
	no_matches := []string{}

	// Plain extraction of every match
	tst_find_all(digits, '123 abc 456', ['123', '456'])
	tst_find_all(r'\w+', 'hi there', ['hi', 'there'])

	// A text without any match yields an empty list
	tst_find_all(digits, 'no numbers', no_matches)

	// Zero-length matches (e.g. boundaries) are intentionally not pinned here:
	// the result depends on how the engine treats empty matches. Per the
	// original note, the VM advances the index after a zero-length match to
	// avoid looping forever, and \b would match at 0, 3, 4 and 7 in '123 abc',
	// each time with an empty text.
	// tst_find_all(r'\b', '123 abc', ['', '', '', '']) // implementation detail, left disabled

	// A start anchor can only match once
	tst_find_all(r'^\w+', 'word word word', ['word'])

	// Matches do not overlap: "ana" is found at index 1 of "banana"; the
	// search then resumes at index 4 ("na"), so the second "ana" is not seen.
	tst_find_all(r'ana', 'banana', ['ana'])
}
57
// test_find_from searches for the word "test" in 'test test test' from
// different start offsets and checks both the match text and its position.
// An expected text of 'none' together with position -1 means "no match".
fn test_find_from() {
	println('\n--- Testing find_from() ---')

	word := r'test'
	// "test" occurs at byte offsets 0, 5 and 10
	haystack := 'test test test'

	// Offset 0: the first occurrence
	tst_find_from(word, haystack, 0, 0, 'test')

	// Offset 1: the first occurrence is skipped, the second is found
	tst_find_from(word, haystack, 1, 5, 'test')

	// Offset 6: only the third occurrence is left
	tst_find_from(word, haystack, 6, 10, 'test')

	// Offset 11 is inside the last occurrence, so nothing can match any more
	tst_find_from(word, haystack, 11, -1, 'none')

	// Offset far beyond the end of the text
	tst_find_from(word, haystack, 50, -1, 'none')

	// Offset that coincides with the start of a match
	tst_find_from(word, haystack, 5, 5, 'test')
}
81
// test_replace checks replace() with literal replacement text, with group
// references in the replacement, and with a reference to a missing group.
fn test_replace() {
	println('\n--- Testing replace() ---')

	// Literal replacement text
	tst_replace(r'\d+', 'abc 123 def', 'NUM', 'abc NUM def')

	// $1 / $2 in the replacement are substituted with the captured groups
	swap_names := r'(\w+), (\w+)'
	tst_replace(swap_names, 'Doe, John', '$2 $1', 'John Doe')

	// Only the FIRST occurrence is expected to be replaced
	// (the original note attributes this to replace() being based on find()).
	tst_replace(r'a', 'bananas', 'o', 'bonanas')

	// Alternation of a start-anchored and an end-anchored branch
	edge_marks := r'(^[#.]+)|([#.]+$)'
	tst_replace(edge_marks, r'_#abc.#_ab#', '*', '_#abc.#_ab*')

	// A reference to a group that does not exist expands to nothing
	tst_replace(r'(\d+)', '123', 'Num: $9', 'Num: ')
}
102
103/*
104fn test_stress_vm() {
105 println('\n--- Testing VM Stability (Stress Test) ---')
106 // Recursive engines often crash on patterns like (a*)* or very long strings
107 // if not carefully managed. The VM should handle this via heap stack.
108
109 long_text := 'a'.repeat(2000)
110 tst_find(r'a+', long_text, long_text)
111
112 println(' [Pass] Long string match')
113
114 // Backtracking stress
115 // Pattern: (a+)+b matching aaaaa....a (fails)
116 // This forces extensive backtracking.
117 short_text := 'a'.repeat(25)
118 mut r := pcre.compile(r'(a+)+b') or { panic(err) }
119 r.max_stack_depth = 4000 // increase the stack depth for this test
120 res := r.find(short_text)
121 assert res == none
122 println(' [Pass] Backtracking stress test')
123}
124*/
125
126// --- Existing Tests ---
127
// test_flags exercises the inline flags (?i), (?m) and (?s) through tst_find.
// Every row of the table is [pattern, text, expected]; an expected value of
// 'none' means that no match must be found.
fn test_flags() {
	println('\n--- Testing Flags ((?i), (?m), (?s)) ---')

	rows := [
		// 1. Case insensitive (?i)
		['(?i)cat', 'Cat', 'Cat'],
		['(?i)CAT', 'cat', 'cat'],
		// char class expansion
		['(?i)[a-z]+', 'UPPER', 'UPPER'],
		['(?i)x', 'X', 'X'],
		['(?i)x', 'y', 'none'],
		// Mixed: the flag only applies to the tokens that follow it,
		// so the leading 'c' stays case-sensitive
		['c(?i)at', 'cAT', 'cAT'],
		['c(?i)at', 'Cat', 'none'],
		// 2. Multiline (?m): ^ matches at the start of a line;
		// without the flag it matches only at the start of the string
		['(?m)^line2', 'line1\nline2', 'line2'],
		['^line2', 'line1\nline2', 'none'],
		// (?m): $ matches at the end of a line;
		// without the flag it matches only at the end of the string
		['(?m)line1$', 'line1\nline2', 'line1'],
		['line1$', 'line1\nline2', 'none'],
		// 3. Dot-all / single line (?s): . matches a newline;
		// without the flag it does not
		['(?s)a.b', 'a\nb', 'a\nb'],
		['a.b', 'a\nb', 'none'],
		// 4. Combined flags (?im)
		['(?im)^line2', 'LINE1\nLINE2', 'LINE2'],
		// --- Negative tests (flags) ---
		['(?i)cat', 'dog', 'none'],
		// 'line2' is not at the start of a line here
		['(?m)^line2', 'line1 line2', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
163
// test_word_boundaries exercises \b and \B through tst_find.
// Every row of the table is [pattern, text, expected]; an expected value of
// 'none' means that no match must be found.
fn test_word_boundaries() {
	println('\n--- Testing Word Boundaries (\\b and \\B) ---')

	rows := [
		// 1. Word boundary (\b) in front of the word
		['\\bcat', 'cat', 'cat'],
		['\\bcat', 'concat', 'none'],
		['\\bcat', 'catapult', 'cat'],
		// ... behind the word
		['cat\\b', 'cat', 'cat'],
		['cat\\b', 'concat', 'cat'],
		['cat\\b', 'catapult', 'none'],
		// ... on both sides
		['\\bcat\\b', 'cat', 'cat'],
		['\\bcat\\b', 'a cat is here', 'cat'],
		['\\bcat\\b', 'concat', 'none'],
		['\\bcat\\b', 'catapult', 'none'],
		// punctuation next to the word
		['\\btest\\b', 'test.', 'test'],
		['\\btest\\b', '(test)', 'test'],
		// 2. Non-word boundary (\B)
		['a\\B', 'ab', 'a'],
		['a\\B', 'a.', 'none'],
		['\\Bcat', 'concat', 'cat'],
		['\\Bcat', 'cat', 'none'],
		['cat\\B', 'catapult', 'cat'],
		// --- Negative tests (word boundaries) ---
		['\\b\\w+\\b', '... ...', 'none'],
		['\\B', 'a', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
195
// test_anchors exercises the ^ and $ anchors through tst_find.
// Every row of the table is [pattern, text, expected]; an expected value of
// 'none' means that no match must be found.
fn test_anchors() {
	println('\n--- Testing Anchors (^ and $) ---')

	rows := [
		// 1. Start of string (^)
		['^abc', 'abc', 'abc'],
		['^abc', 'abcdef', 'abc'],
		['^abc', 'abc abc', 'abc'],
		['^\\d+', '123 text', '123'],
		// 2. End of string ($)
		['xyz$', 'xyz', 'xyz'],
		['xyz$', 'abcxyz', 'xyz'],
		['\\d+$', 'text 123', '123'],
		// 3. Both anchors (^...$)
		['^hello$', 'hello', 'hello'],
		// 4. Zero-width matches
		['^', 'abc', ''],
		['$', 'abc', ''],
		// 5. Anchors combined with alternation
		['^a|b$', 'apple', 'a'],
		['^a|b$', 'blob', 'b'],
		// 6. Anchors combined with groups
		['^(abc)+$', 'abcabc', 'abcabc'],
		// --- Negative tests (anchors) ---
		['^abc', 'xyzabc', 'none'],
		['^\\d+', 'text 123', 'none'],
		['xyz$', 'xyzabc', 'none'],
		['\\d+$', '123 text', 'none'],
		['^hello$', 'hello world', 'none'],
		['^hello$', 'say hello', 'none'],
		['^a|b$', 'cba', 'none'],
		['^(abc)+$', 'abcabcx', 'none'],
		['^$', 'a', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
235
// test_regex covers the basic feature set: simple quantifiers, the built-in
// and custom character classes, alternation, Unicode text, fullmatch() and a
// few patterns that must be rejected at compile time.
// Every table row is [pattern, text, expected]; an expected value of 'none'
// means that no match must be found.
fn test_regex() {
	println('\n--- Testing Basic Features ---')
	basics := [
		['a?b', 'ab', 'ab'],
		['a?b', 'b', 'b'],
		['a+b', 'aaab', 'aaab'],
		['a*b', 'b', 'b'],
		['\\d+', '123 abc', '123'],
	]
	for row in basics {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Character Classes ---')
	classes := [
		['\\w+', 'word1_ and', 'word1_'],
		['\\W+', ' and', ' '],
		['\\s+', ' start', ' '],
		['\\d{3}-\\d{4}', 'call 555-1234 now', '555-1234'],
		['\\D+', 'call 555', 'call '],
		['\\a+', 'lowercase', 'lowercase'],
		['\\A+', 'UPPER', 'UPPER'],
	]
	for row in classes {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Alternation (|) ---')
	alternations := [
		['cat|dog', 'the dog says meow', 'dog'],
		['a(b|c)d', 'acd', 'acd'],
		['apple|apply', 'I want to apply', 'apply'],
	]
	for row in alternations {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Custom Character Classes ([...]) ---')
	custom_classes := [
		['[aeiou]', 'hello world', 'e'],
		['gr[ae]y', 'the color grey', 'grey'],
		['[^aeiou]+', 'rhythm', 'rhythm'],
		['[a-z]+', 'lowercase123', 'lowercase'],
		['[a-zA-Z0-9_]+', 'word_1_with_everything', 'word_1_with_everything'],
	]
	for row in custom_classes {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Unicode ---')
	unicode_rows := [
		['日本語', 'Text containing 日本語.', '日本語'],
		['h.llo', 'héllo wørld', 'héllo'],
		['(é)+', 'cafééé', 'ééé'],
		['😀+', 'Happy 😀😀 day', '😀😀'],
	]
	for row in unicode_rows {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing fullmatch() ---')
	full_ok := [
		[r'\d+', '12345', '12345'],
		['(?s).*', 'Any content including 😀', 'Any content including 😀'],
	]
	for row in full_ok {
		tst_fullmatch(row[0], row[1], row[2])
	}

	// --- Negative tests (basic) ---
	find_misses := [
		['abc', 'ab', 'none'],
		['abc', 'acb', 'none'],
		['a+b', 'b', 'none'],
		['\\d+', 'abc', 'none'],
		['\\D+', '123', 'none'],
		['\\w+', '@#$', 'none'],
		['\\s+', 'Text', 'none'],
		['\\a+', 'UPPERCASE', 'none'],
		['\\A+', 'lowercase', 'none'],
		['cat|dog', 'bird', 'none'],
		['[0-9]', 'a', 'none'],
		['[^0-9]', '1', 'none'],
	]
	for row in find_misses {
		tst_find(row[0], row[1], row[2])
	}
	full_misses := [
		[r'\d+', '12345abc', 'none'],
		[r'\d+', 'abc12345', 'none'],
	]
	for row in full_misses {
		tst_fullmatch(row[0], row[1], row[2])
	}

	println('\n--- Testing Compilation Errors ---')
	for broken in ['a++', '[a-z', 'a|'] {
		tst_compile_error(broken)
	}
}
296
// test_complex_quantifiers exercises +, * and ? on single characters, groups,
// alternations and character classes through tst_find.
// Every row of the table is [pattern, text, expected]; an expected value of
// 'none' means that no match must be found.
fn test_complex_quantifiers() {
	println('\n--- Testing Complex Quantifiers (+, *, ?) ---')

	rows := [
		// one or more
		['a+', 'aaaaa', 'aaaaa'],
		['a+b', 'aaaaab', 'aaaaab'],
		// zero or more
		['x*y', 'y', 'y'],
		['x*y', 'xy', 'xy'],
		['x*y', 'xxxy', 'xxxy'],
		// optional
		['colou?r', 'color', 'color'],
		['colou?r', 'colour', 'colour'],
		['x?y', 'xy', 'xy'],
		['x?y', 'y', 'y'],
		// quantified groups
		['(ab)+', 'ababab', 'ababab'],
		['(ha)+', 'hahaha!', 'hahaha'],
		// quantified alternations
		['(cat|dog)+', 'catdogcat', 'catdogcat'],
		['(a|b)+', 'abaabbba', 'abaabbba'],
		// quantified character classes
		['[0-9]+', 'Order 12345', '12345'],
		['[a-z]*', '123', ''],
		// --- Negative tests (complex quantifiers) ---
		['a+', '', 'none'],
		['a+', 'b', 'none'],
		['a+b', 'aaac', 'none'],
		['x?y', 'x', 'none'],
		['(ab)+', 'ac', 'none'],
		['[0-9]+', 'abc', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
329
// test_range_quantifiers exercises the {m}, {m,}, {,n} and {m,n} quantifiers
// through tst_find.
// Every row of the table is [pattern, text, expected]; an expected value of
// 'none' means that no match must be found.
fn test_range_quantifiers() {
	println('\n--- Testing Range Quantifiers {m,n} ---')

	digits_dash_word := r'\d{2,4}-\w{2}'
	rows := [
		// exact count
		['a{3}', 'aaa', 'aaa'],
		['a{3}', 'aaaa', 'aaa'],
		// lower bound only
		['a{2,}', 'aa', 'aa'],
		['a{2,}', 'aaaaa', 'aaaaa'],
		// upper bound only
		['a{,3}', 'aaaa', 'aaa'],
		['a{,3}', 'aa', 'aa'],
		['a{,3}', '', ''],
		// both bounds
		['a{2,4}', 'aa', 'aa'],
		['a{2,4}', 'aaa', 'aaa'],
		['a{2,4}', 'aaaa', 'aaaa'],
		['a{2,4}', 'aaaaa', 'aaaa'],
		// ranges combined with character classes
		[digits_dash_word, '123-ab', '123-ab'],
		[digits_dash_word, '12345-ab', '2345-ab'],
		// --- Negative tests (range quantifiers) ---
		['a{3}', 'aa', 'none'],
		['a{2,}', 'a', 'none'],
		['a{2,4}', 'a', 'none'],
		[digits_dash_word, '1-ab', 'none'],
		[digits_dash_word, '123-a', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
358
// test_named_groups checks (?P<name>...) groups: positional access through
// `groups`, lookup by name through group_by_name(), nested named groups,
// a mix of named and unnamed groups, and a couple of non-matching inputs.
fn test_named_groups() {
	println('\n--- Testing Named Groups ---')

	// Two named groups side by side
	date_re := pcre.compile('(?P<year>\\d{4})-(?P<month>\\d{2})') or { panic(err) }
	date_m := date_re.find('Date: 2025-01') or { panic('Match not found') }

	assert date_m.groups[0] == '2025'
	assert date_m.groups[1] == '01'

	assert date_re.group_by_name(date_m, 'year') == '2025'
	assert date_re.group_by_name(date_m, 'month') == '01'
	// An unknown name yields an empty string
	assert date_re.group_by_name(date_m, 'missing') == ''

	// A named group nested inside another named group
	nested_re := pcre.compile('(?P<entry>key: (?P<val>\\d+))') or { panic(err) }
	nested_m := nested_re.find('List [ key: 99 ]') or { panic('Match not found') }

	entry := nested_re.group_by_name(nested_m, 'entry')
	val := nested_re.group_by_name(nested_m, 'val')
	println('Nested: entry="${entry}", val="${val}"')
	assert entry == 'key: 99'
	assert val == '99'

	// A named group followed by an unnamed one
	mixed_re := pcre.compile('(?P<key>\\w+): (\\d+)') or { panic(err) }
	mixed_m := mixed_re.find('Price: 100') or { panic('Match not found') }

	assert mixed_m.groups[0] == 'Price'
	assert mixed_m.groups[1] == '100'
	assert mixed_re.group_by_name(mixed_m, 'key') == 'Price'

	// Three single-character named groups in sequence
	seq_re := pcre.compile('(?P<a>a)(?P<b>b)(?P<c>c)') or { panic(err) }
	seq_m := seq_re.find('abc') or { panic('Match not found') }
	for name in ['a', 'b', 'c'] {
		// each group is expected to capture exactly its own name
		assert seq_re.group_by_name(seq_m, name) == name
	}

	// --- Negative tests (named groups) ---
	// A two-digit year does not satisfy \d{4}
	short_year := date_re.find('Date: 99-01')
	if short_year == none {
		println('Found: none (Expected: none)')
	} else {
		assert false, 'Should not match'
	}
	tst_find('(?P<id>\\d+)', 'abc', 'none')
}
409
// test_non_capturing_groups checks that (?:...) groups take part in matching
// but do not show up in the list of captured groups.
fn test_non_capturing_groups() {
	println('\n--- Testing Non-Capturing Groups ---')

	no_groups := []string{}

	// A lone non-capturing group: nothing is captured
	tst_find_with_groups('(?:a|b)c', 'ac', 'ac', no_groups)

	// A non-capturing group between two capturing ones is skipped
	tst_find_with_groups('(a)(?:b)(c)', 'abc', 'abc', ['a', 'c'])

	// A non-capturing group nested inside a capturing one
	tst_find_with_groups('(a(?:b)c)', 'abc', 'abc', ['abc'])

	// A non-capturing prefix followed by a captured value
	tst_find_with_groups('(?:header): (\\d+)', 'header: 123', 'header: 123', ['123'])

	// --- Negative tests (non-capturing groups) ---
	misses := [
		['(?:a|b)c', 'dc'],
		['(?:a)b', 'c'],
	]
	for miss in misses {
		tst_find(miss[0], miss[1], 'none')
	}
}
425
426// --- Helper Functions ---
427
// tst_find compiles `pattern`, runs find() on `text` and hands the outcome to
// check_result together with `expected` ('none' stands for "no match").
// A pattern that does not compile is reported and fails the test.
fn tst_find(pattern string, text string, expected string) {
	print('[find] Pattern: "${pattern}", Text: "${text}" -> ')
	if re := pcre.compile(pattern) {
		found := re.find(text)
		check_result(found, expected)
	} else {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
	}
}
438
// tst_find_all compiles `pattern`, runs find_all() on `text` and asserts that
// the texts of the returned matches equal `expected`, in order.
// A pattern that does not compile is reported and fails the test, in the same
// way as in tst_find / tst_fullmatch (instead of panicking, which also left
// the progress line printed below without a line ending).
fn tst_find_all(pattern string, text string, expected []string) {
	print('[find_all] Pattern: "${pattern}", Text: "${text}" -> ')
	r := pcre.compile(pattern) or {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
		return
	}
	matches := r.find_all(text)

	// Only the matched texts are compared
	res_strs := matches.map(it.text)

	println('Found: ${res_strs}')
	assert res_strs == expected
}
452
// tst_find_from compiles `pattern` and runs find_from() on `text`, starting
// at `start`. When a match is found, its text and start position must equal
// `expected_text` and `expected_pos`; when nothing is found, `expected_text`
// must be the sentinel 'none'.
// A pattern that does not compile is reported and fails the test, in the same
// way as in tst_find / tst_fullmatch (instead of panicking, which also left
// the progress line printed below without a line ending).
fn tst_find_from(pattern string, text string, start int, expected_pos int, expected_text string) {
	print('[find_from] Pattern: "${pattern}", Start: ${start} -> ')
	r := pcre.compile(pattern) or {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
		return
	}
	match_res := r.find_from(text, start)

	if match_res != none {
		println('Found: "${match_res.text}" at ${match_res.start}')
		assert match_res.text == expected_text
		assert match_res.start == expected_pos
	} else {
		println('Found: none')
		assert expected_text == 'none'
	}
}
467
// tst_replace compiles `pattern`, calls replace() on `text` with the
// replacement `repl` and asserts that the result equals `expected`.
// A pattern that does not compile is reported and fails the test, in the same
// way as in tst_find / tst_fullmatch (instead of panicking, which also left
// the progress line printed below without a line ending).
fn tst_replace(pattern string, text string, repl string, expected string) {
	print('[replace] Pattern: "${pattern}", Repl: "${repl}" -> ')
	r := pcre.compile(pattern) or {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
		return
	}
	res := r.replace(text, repl)
	println('Result: "${res}"')
	assert res == expected
}
475
// tst_fullmatch compiles `pattern`, runs fullmatch() on `text` and hands the
// outcome to check_result together with `expected` ('none' stands for
// "no match"). A pattern that does not compile is reported and fails the test.
fn tst_fullmatch(pattern string, text string, expected string) {
	print('[fullmatch] Pattern: "${pattern}", Text: "${text}" -> ')
	if re := pcre.compile(pattern) {
		outcome := re.fullmatch(text)
		check_result(outcome, expected)
	} else {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
	}
}
486
// check_result prints an optional match next to the expectation and asserts
// that they agree: a present match must have the text `expected`, an absent
// match requires `expected` to be the sentinel string 'none'.
fn check_result(match_res ?pcre.Match, expected string) {
	if found := match_res {
		println('Found: "${found.text}" (Expected: "${expected}")')
		assert found.text == expected
		return
	}
	println('Found: none (Expected: "${expected}")')
	assert expected == 'none'
}
496
// tst_find_with_groups compiles `pattern`, runs find() on `text` and asserts
// both the matched text and the list of captured groups. Unlike tst_find, a
// missing match always fails the test. A pattern that does not compile is
// reported and fails the test as well.
fn tst_find_with_groups(pattern string, text string, expected_match string, expected_groups []string) {
	print('[find+groups] Pattern: "${pattern}", Text: "${text}" -> ')
	re := pcre.compile(pattern) or {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
		return
	}
	hit := re.find(text) or {
		println('Found: none')
		assert false // a match was required here
		return
	}
	println('Found: "${hit.text}", Groups: ${hit.groups}')
	assert hit.text == expected_match
	assert hit.groups == expected_groups
}
514
// tst_compile_error asserts that `pattern` is rejected by pcre.compile().
// The error is printed when compilation fails as expected; a pattern that
// compiles fails the test.
fn tst_compile_error(pattern string) {
	print('[compile_error] Pattern: "${pattern}" -> ')
	if _ := pcre.compile(pattern) {
		println('Error: Did not get a compilation error!')
		assert false
	} else {
		println('Caught expected error: ${err}')
	}
}
524
// test_non_greedy_quantifiers exercises the lazy quantifiers *?, +?, ?? and
// {m,n}? through tst_find, next to their greedy counterparts.
// Every row of the table is [pattern, text, expected].
fn test_non_greedy_quantifiers() {
	println('\n--- Testing Non-Greedy Quantifiers (*?, +?, ??, {m,n}?) ---')

	rows := [
		// 1. Lazy star (*?): stops at the first closing '>' ...
		[r'<.*?>', '<div>content</div>', '<div>'],
		// ... while the greedy default runs up to the last '>'
		[r'<.*>', '<div>content</div>', '<div>content</div>'],
		// 2. Lazy plus (+?): a single 'a' already satisfies the pattern
		[r'a+?', 'aaaaa', 'a'],
		// ... but it has to expand over every 'a' to reach the 'b'
		[r'a+?b', 'aaab', 'aaab'],
		// 3. Lazy question mark (??): zero occurrences are preferred
		[r'a??', 'a', ''],
		// the lazy 'u' is skipped and 'color' matches directly
		[r'colou??r', 'color', 'color'],
		// skipping 'u' fails at 'r', so the engine backtracks and takes 'u'
		[r'colou??r', 'colour', 'colour'],
		// 4. Lazy range ({m,n}?): the minimum of 2 digits ...
		[r'\d{2,5}?', '123456789', '12'],
		// ... versus the greedy maximum of 5 digits
		[r'\d{2,5}', '123456789', '12345'],
		// 5. Real-world case (user report): escaped characters plus a lazy
		// capture group; only the first '$t(...)' call must be matched
		[r'\$t\((.*?)\)', r'$t(common.hello) dear $t(common.name)', r'$t(common.hello)'],
		// --- Edge cases ---
		// The lazy part stops at the first 'y'
		[r'x.*?y', 'x123y456y', 'x123y'],
		// Anchored at the start, .*? grows until it reaches 'b'
		[r'^.*?b', '123b', '123b'],
		// Laziness must not turn a possible match into a failure: the
		// optional 'a' has to be given up so that the final 'a' can match
		[r'a?a', 'a', 'a'],
		[r'a??a', 'a', 'a'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
574
// test_compatibility_layer exercises the compatibility API used in this file:
// new_regex(), match_str(), get() and get_all().
fn test_compatibility_layer() {
	// new_regex() takes the pattern plus a second argument; 0 is passed for it
	// (the original note describes it as an ignored C-style flag argument).
	re := pcre.new_regex(r'(\w+)\s+(\d+)', 0) or {
		assert false, 'new_regex failed to compile: ${err}'
		return
	}

	subject := 'item 42 ignored item 99'

	// match_str() from offset 0; the third argument is again passed as 0.
	// The first hit is expected to be "item 42".
	first := re.match_str(subject, 0, 0) or {
		assert false, 'match_str failed to find match'
		return
	}

	// get(0) is the whole match, get(1) and get(2) are the capture groups
	whole := first.get(0) or { '' }
	assert whole == 'item 42'

	word := first.get(1) or { '' }
	assert word == 'item'

	number := first.get(2) or { '' }
	assert number == '42'

	// There are only two groups, so index 3 must yield none
	if _ := first.get(3) {
		assert false, 'get(3) should return none for 2 groups'
	}

	// get_all() returns the whole match followed by the groups
	wanted := ['item 42', 'item', '42']
	captures := first.get_all()
	assert captures.len == 3
	for idx, want in wanted {
		assert captures[idx] == want
	}

	// match_str() from offset 7, i.e. right behind "item 42" (7 bytes long);
	// the next hit is expected to be "item 99"
	second := re.match_str(subject, 7, 0) or {
		assert false, 'match_str failed to find second match from offset'
		return
	}

	assert second.get(0) or { '' } == 'item 99'
	assert second.get(2) or { '' } == '99'

	// Starting at the very end of the text nothing can be found
	at_end := re.match_str(subject, subject.len, 0)
	if _ := at_end {
		assert false, 'match_str should return none when no match is found'
	}
}
638
// test_hex_escapes checks the \xHH (two hex digits) and \XHHHH (four hex
// digits) escapes through tst_find, and that malformed escapes are rejected
// at compile time. Every table row is [pattern, text, expected].
fn test_hex_escapes() {
	rows := [
		// \xHH - two hex digits
		// 0x41 = 'A'
		[r'\x41', 'ABC', 'A'],
		// 0x61 = 'a'
		[r'\x61', 'abc', 'a'],
		[r'\x41+', 'AAAB', 'AAA'],
		// space followed by 'A'
		[r'\x20\x41', ' A test', ' A'],
		// \XHHHH - four hex digits (Unicode code point)
		// U+0041 = 'A'
		[r'\X0041', 'ABC', 'A'],
		// U+0061 = 'a'
		[r'\X0061', 'abc', 'a'],
		// U+03B1 = 'α'
		[r'\X03B1', 'αβγ', 'α'],
		// A run of consecutive \xHH escapes that spells "Hello"
		[r'\x48\x65\x6C\x6C\x6F', 'Hello World', 'Hello'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}

	// Malformed escapes must not compile:
	// a single digit, non-hex characters, and only three digits after \X
	for malformed in [r'\x4', r'\xGG', r'\X004'] {
		tst_compile_error(malformed)
	}
}
659
// test_duplicate_named_groups checks that using the same group name twice is
// a compile error, while two distinct names compile and can be looked up.
fn test_duplicate_named_groups() {
	// The name `id` appears twice: must be rejected
	tst_compile_error(r'(?P<id>\d+)-(?P<id>\w+)')

	// Distinct names are accepted
	if re := pcre.compile(r'(?P<a>\d+)-(?P<b>\w+)') {
		if hit := re.find('12-abc') {
			assert re.group_by_name(hit, 'a') == '12'
			assert re.group_by_name(hit, 'b') == 'abc'
		} else {
			assert false, 'Should match'
		}
	} else {
		assert false, 'Should compile: ${err}'
	}
}
675
// test_invalid_quantifier_ranges checks that a range whose minimum exceeds
// its maximum is a compile error, while {0,0} compiles and matches nothing.
fn test_invalid_quantifier_ranges() {
	// min > max must be rejected
	for inverted in [r'a{3,1}', r'a{5,2}'] {
		tst_compile_error(inverted)
	}

	// {0,0} is valid: the 'a' is matched zero times, so only 'b' is consumed
	if re := pcre.compile(r'a{0,0}b') {
		if hit := re.find('b') {
			assert hit.text == 'b'
		} else {
			assert false, 'Should match'
		}
	} else {
		assert false, 'Should compile: ${err}'
	}
}
692
// test_find_all_utf8_safety runs find_all() with patterns that can match the
// empty string on texts containing multi-byte characters. It checks that no
// match starts in the middle of a UTF-8 sequence and that the call returns
// at least one match for the emoji text.
fn test_find_all_utf8_safety() {
	// 'é' is 2 bytes (0xC3 0xA9), so byte offset 2 lies inside a rune
	text := 'aé'
	// Computed once up front; it does not change between iterations
	text_bytes := text.bytes()

	r := pcre.compile(r'x*') or { panic(err) }
	matches := r.find_all(text)
	// Every match start must fall on a rune boundary
	// (only `start` is checked here, not the end of the match)
	for m in matches {
		// a start equal to the text length (match at the very end) has no
		// byte to inspect
		if m.start < text_bytes.len {
			// UTF-8 continuation bytes have the bit pattern 10xxxxxx
			assert (text_bytes[m.start] & 0xC0) != 0x80, 'Misaligned match start at ${m.start}'
		}
	}

	// find_all must terminate on a text that starts with an emoji
	r2 := pcre.compile(r'y*') or { panic(err) }
	matches2 := r2.find_all('😀!')
	assert matches2.len > 0
}
710