| 1 | /* |
| 2 | regex_test.v |
| 3 | |
| 4 | Copyright (c) 2026 Dario Deledda. All rights reserved. |
| 5 | Use of this source code is governed by an MIT license |
| 6 | that can be found in the LICENSE file. |
| 7 | */ |
| 8 | import regex.pcre |
| 9 | |
// main runs every test function defined in this file, one after another, and
// prints a closing line once all of them have returned.
fn main() {
	println('Running pcre tests...\n')

	test_regex()
	test_complex_quantifiers()
	test_range_quantifiers()
	test_anchors()
	test_word_boundaries()
	test_flags()
	test_named_groups()
	test_non_capturing_groups()

	// New features tests
	test_find_all()
	test_find_from()
	test_replace()
	// test_stress_vm() stays disabled: its definition is commented out in this file.

	// These tests are defined in this file but were previously never invoked
	// from main, so the closing message could be printed without them running.
	test_non_greedy_quantifiers()
	test_compatibility_layer()
	test_hex_escapes()
	test_duplicate_named_groups()
	test_invalid_quantifier_ranges()
	test_find_all_utf8_safety()

	println('\nAll tests passed!')
}
| 30 | |
| 31 | // --- New Feature Tests --- |
| 32 | |
// test_find_all feeds several pattern/text pairs to tst_find_all and states the
// list of match texts expected back for each.
fn test_find_all() {
	println('\n--- Testing find_all() ---')

	digits := r'\d+'
	nothing := []string{}

	// Every hit is collected, in order of appearance.
	tst_find_all(digits, '123 abc 456', ['123', '456'])
	tst_find_all(r'\w+', 'hi there', ['hi', 'there'])

	// A text without any hit gives an empty list.
	tst_find_all(digits, 'no numbers', nothing)

	// Zero-length matches (e.g. r'\b' on '123 abc', which can match at 0, 3, 4
	// and 7) are deliberately not asserted here: the number of empty strings
	// returned is an implementation detail of how the VM steps past an empty
	// match to avoid looping forever.

	// A pattern anchored at the start is expected to hit only once.
	tst_find_all(r'^\w+', 'word word word', ['word'])

	// Results do not overlap: after "ana" at index 1 of "banana" the search
	// resumes at index 4 ("na"), so the second, overlapping "ana" is not reported.
	tst_find_all(r'ana', 'banana', ['ana'])
}
| 57 | |
// test_find_from searches 'test test test' for 'test' from various start
// offsets and states the expected match position for each.
fn test_find_from() {
	println('\n--- Testing find_from() ---')

	haystack := 'test test test'
	needle := r'test'

	// [start offset, expected match position]:
	// 0 finds the first word, 1 skips it and finds the second, 6 finds the third.
	for probe in [[0, 0], [1, 5], [6, 10]] {
		tst_find_from(needle, haystack, probe[0], probe[1], 'test')
	}

	// Too close to the end (11) or far out of bounds (50): nothing is found.
	for start in [11, 50] {
		tst_find_from(needle, haystack, start, -1, 'none')
	}

	// Starting exactly on a match position finds that match.
	tst_find_from(needle, haystack, 5, 5, 'test')
}
| 81 | |
// test_replace runs a table of replace() scenarios through tst_replace.
fn test_replace() {
	println('\n--- Testing replace() ---')

	// Each row is [pattern, input text, replacement, expected result].
	rows := [
		// Plain literal replacement.
		[r'\d+', 'abc 123 def', 'NUM', 'abc NUM def'],
		// Capture groups substituted into the replacement text.
		[r'(\w+), (\w+)', 'Doe, John', '$2 $1', 'John Doe'],
		// Only the first occurrence is expected to be replaced.
		['a', 'bananas', 'o', 'bonanas'],
		// Alternation of a start-anchored and an end-anchored run of '#'/'.':
		// for this input only the trailing '#' is expected to change.
		[r'(^[#.]+)|([#.]+$)', '_#abc.#_ab#', '*', '_#abc.#_ab*'],
		// A reference to a group that does not exist expands to nothing.
		[r'(\d+)', '123', 'Num: $9', 'Num: '],
	]
	for row in rows {
		tst_replace(row[0], row[1], row[2], row[3])
	}
}
| 102 | |
| 103 | /* |
| 104 | fn test_stress_vm() { |
| 105 | println('\n--- Testing VM Stability (Stress Test) ---') |
| 106 | // Recursive engines often crash on patterns like (a*)* or very long strings |
| 107 | // if not carefully managed. The VM should handle this via heap stack. |
| 108 | |
| 109 | long_text := 'a'.repeat(2000) |
| 110 | tst_find(r'a+', long_text, long_text) |
| 111 | |
| 112 | println(' [Pass] Long string match') |
| 113 | |
| 114 | // Backtracking stress |
| 115 | // Pattern: (a+)+b matching aaaaa....a (fails) |
| 116 | // This forces extensive backtracking. |
| 117 | short_text := 'a'.repeat(25) |
| 118 | mut r := pcre.compile(r'(a+)+b') or { panic(err) } |
| 119 | r.max_stack_depth = 4000 // increase the stack depth for this test |
| 120 | res := r.find(short_text) |
| 121 | assert res == none |
| 122 | println(' [Pass] Backtracking stress test') |
| 123 | } |
| 124 | */ |
| 125 | |
| 126 | // --- Existing Tests --- |
| 127 | |
// test_flags checks the inline flags (?i), (?m), (?s) and their combination by
// running a table of cases through tst_find.
fn test_flags() {
	println('\n--- Testing Flags ((?i), (?m), (?s)) ---')

	// Each row is [pattern, text, expected match]; 'none' means no match.
	rows := [
		// (?i): case-insensitive matching, including character classes.
		['(?i)cat', 'Cat', 'Cat'],
		['(?i)CAT', 'cat', 'cat'],
		['(?i)[a-z]+', 'UPPER', 'UPPER'],
		['(?i)x', 'X', 'X'],
		['(?i)x', 'y', 'none'],
		// A flag in the middle of a pattern affects only what follows it,
		// so the leading 'c' stays case-sensitive.
		['c(?i)at', 'cAT', 'cAT'],
		['c(?i)at', 'Cat', 'none'],
		// (?m): '^' also matches at the start of a line; without the flag
		// it matches only at the start of the string.
		['(?m)^line2', 'line1\nline2', 'line2'],
		['^line2', 'line1\nline2', 'none'],
		// (?m): '$' also matches at the end of a line; without the flag
		// it matches only at the end of the string.
		['(?m)line1$', 'line1\nline2', 'line1'],
		['line1$', 'line1\nline2', 'none'],
		// (?s): '.' matches a newline; by default it does not.
		['(?s)a.b', 'a\nb', 'a\nb'],
		['a.b', 'a\nb', 'none'],
		// Several flags combined in one group.
		['(?im)^line2', 'LINE1\nLINE2', 'LINE2'],
		// Negative cases.
		['(?i)cat', 'dog', 'none'],
		['(?m)^line2', 'line1 line2', 'none'], // 'line2' is not at a line start
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
| 163 | |
// test_word_boundaries checks the \b (word boundary) and \B (non-boundary)
// assertions by running a table of cases through tst_find.
fn test_word_boundaries() {
	println('\n--- Testing Word Boundaries (\\b and \\B) ---')

	// Each row is [pattern, text, expected match]; 'none' means no match.
	rows := [
		// \b in front of the word.
		[r'\bcat', 'cat', 'cat'],
		[r'\bcat', 'concat', 'none'],
		[r'\bcat', 'catapult', 'cat'],
		// \b behind the word.
		[r'cat\b', 'cat', 'cat'],
		[r'cat\b', 'concat', 'cat'],
		[r'cat\b', 'catapult', 'none'],
		// \b on both sides: whole words only.
		[r'\bcat\b', 'cat', 'cat'],
		[r'\bcat\b', 'a cat is here', 'cat'],
		[r'\bcat\b', 'concat', 'none'],
		[r'\bcat\b', 'catapult', 'none'],
		// Punctuation next to a word counts as a boundary.
		[r'\btest\b', 'test.', 'test'],
		[r'\btest\b', '(test)', 'test'],
		// \B: a position that is not a word boundary.
		[r'a\B', 'ab', 'a'],
		[r'a\B', 'a.', 'none'],
		[r'\Bcat', 'concat', 'cat'],
		[r'\Bcat', 'cat', 'none'],
		[r'cat\B', 'catapult', 'cat'],
		// Negative cases.
		[r'\b\w+\b', '... ...', 'none'],
		[r'\B', 'a', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
| 195 | |
// test_anchors checks the '^' and '$' anchors, alone and combined with
// alternation and groups, by running a table of cases through tst_find.
fn test_anchors() {
	println('\n--- Testing Anchors (^ and $) ---')

	// Each row is [pattern, text, expected match]; 'none' means no match.
	rows := [
		// '^' pins the match to the start of the string.
		['^abc', 'abc', 'abc'],
		['^abc', 'abcdef', 'abc'],
		['^abc', 'abc abc', 'abc'],
		[r'^\d+', '123 text', '123'],
		// '$' pins the match to the end of the string.
		['xyz$', 'xyz', 'xyz'],
		['xyz$', 'abcxyz', 'xyz'],
		[r'\d+$', 'text 123', '123'],
		// Both anchors: the whole string must match.
		['^hello$', 'hello', 'hello'],
		// A bare anchor yields a zero-width (empty) match.
		['^', 'abc', ''],
		['$', 'abc', ''],
		// Anchors inside an alternation apply to their own branch.
		['^a|b$', 'apple', 'a'],
		['^a|b$', 'blob', 'b'],
		// Anchors around a repeated group.
		['^(abc)+$', 'abcabc', 'abcabc'],
		// Negative cases.
		['^abc', 'xyzabc', 'none'],
		[r'^\d+', 'text 123', 'none'],
		['xyz$', 'xyzabc', 'none'],
		[r'\d+$', '123 text', 'none'],
		['^hello$', 'hello world', 'none'],
		['^hello$', 'say hello', 'none'],
		['^a|b$', 'cba', 'none'],
		['^(abc)+$', 'abcabcx', 'none'],
		['^$', 'a', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
| 235 | |
// test_regex covers the basic engine features section by section: simple
// quantifiers, escape classes, alternation, bracket classes, Unicode text,
// fullmatch(), negative cases and patterns that must fail to compile.
// In every table a row is [pattern, text, expected match]; 'none' means no match.
fn test_regex() {
	println('\n--- Testing Basic Features ---')
	basics := [
		['a?b', 'ab', 'ab'],
		['a?b', 'b', 'b'],
		['a+b', 'aaab', 'aaab'],
		['a*b', 'b', 'b'],
		[r'\d+', '123 abc', '123'],
	]
	for row in basics {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Character Classes ---')
	// Going by the expectations below, \a matches lowercase letters and \A
	// uppercase letters in this engine.
	escapes := [
		[r'\w+', 'word1_ and', 'word1_'],
		[r'\W+', ' and', ' '],
		[r'\s+', ' start', ' '],
		[r'\d{3}-\d{4}', 'call 555-1234 now', '555-1234'],
		[r'\D+', 'call 555', 'call '],
		[r'\a+', 'lowercase', 'lowercase'],
		[r'\A+', 'UPPER', 'UPPER'],
	]
	for row in escapes {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Alternation (|) ---')
	alternations := [
		['cat|dog', 'the dog says meow', 'dog'],
		['a(b|c)d', 'acd', 'acd'],
		['apple|apply', 'I want to apply', 'apply'],
	]
	for row in alternations {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Custom Character Classes ([...]) ---')
	brackets := [
		['[aeiou]', 'hello world', 'e'],
		['gr[ae]y', 'the color grey', 'grey'],
		['[^aeiou]+', 'rhythm', 'rhythm'],
		['[a-z]+', 'lowercase123', 'lowercase'],
		['[a-zA-Z0-9_]+', 'word_1_with_everything', 'word_1_with_everything'],
	]
	for row in brackets {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing Unicode ---')
	unicode := [
		['日本語', 'Text containing 日本語.', '日本語'],
		['h.llo', 'héllo wørld', 'héllo'],
		['(é)+', 'cafééé', 'ééé'],
		['😀+', 'Happy 😀😀 day', '😀😀'],
	]
	for row in unicode {
		tst_find(row[0], row[1], row[2])
	}

	println('\n--- Testing fullmatch() ---')
	tst_fullmatch(r'\d+', '12345', '12345')
	tst_fullmatch('(?s).*', 'Any content including 😀', 'Any content including 😀')

	// Negative cases: none of these may produce a match.
	negatives := [
		['abc', 'ab', 'none'],
		['abc', 'acb', 'none'],
		['a+b', 'b', 'none'],
		[r'\d+', 'abc', 'none'],
		[r'\D+', '123', 'none'],
		[r'\w+', '@#$', 'none'],
		[r'\s+', 'Text', 'none'],
		[r'\a+', 'UPPERCASE', 'none'],
		[r'\A+', 'lowercase', 'none'],
		['cat|dog', 'bird', 'none'],
		['[0-9]', 'a', 'none'],
		['[^0-9]', '1', 'none'],
	]
	for row in negatives {
		tst_find(row[0], row[1], row[2])
	}
	// fullmatch() rejects text with extra characters after or before the digits.
	tst_fullmatch(r'\d+', '12345abc', 'none')
	tst_fullmatch(r'\d+', 'abc12345', 'none')

	println('\n--- Testing Compilation Errors ---')
	for broken in ['a++', '[a-z', 'a|'] {
		tst_compile_error(broken)
	}
}
| 296 | |
// test_complex_quantifiers checks '+', '*' and '?' applied to single
// characters, groups, alternations and bracket classes via tst_find.
fn test_complex_quantifiers() {
	println('\n--- Testing Complex Quantifiers (+, *, ?) ---')

	// Each row is [pattern, text, expected match]; 'none' means no match.
	rows := [
		// One or more.
		['a+', 'aaaaa', 'aaaaa'],
		['a+b', 'aaaaab', 'aaaaab'],
		// Zero or more.
		['x*y', 'y', 'y'],
		['x*y', 'xy', 'xy'],
		['x*y', 'xxxy', 'xxxy'],
		// Optional.
		['colou?r', 'color', 'color'],
		['colou?r', 'colour', 'colour'],
		['x?y', 'xy', 'xy'],
		['x?y', 'y', 'y'],
		// Repeated groups.
		['(ab)+', 'ababab', 'ababab'],
		['(ha)+', 'hahaha!', 'hahaha'],
		// Repeated alternations.
		['(cat|dog)+', 'catdogcat', 'catdogcat'],
		['(a|b)+', 'abaabbba', 'abaabbba'],
		// Repeated bracket classes; '*' may match the empty string.
		['[0-9]+', 'Order 12345', '12345'],
		['[a-z]*', '123', ''],
		// Negative cases.
		['a+', '', 'none'],
		['a+', 'b', 'none'],
		['a+b', 'aaac', 'none'],
		['x?y', 'x', 'none'],
		['(ab)+', 'ac', 'none'],
		['[0-9]+', 'abc', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
| 329 | |
// test_range_quantifiers checks the counted forms {n}, {n,}, {,n} and {m,n}
// by running a table of cases through tst_find.
fn test_range_quantifiers() {
	println('\n--- Testing Range Quantifiers {m,n} ---')

	date_like := r'\d{2,4}-\w{2}'

	// Each row is [pattern, text, expected match]; 'none' means no match.
	rows := [
		// Exact count.
		['a{3}', 'aaa', 'aaa'],
		['a{3}', 'aaaa', 'aaa'],
		// Lower bound only.
		['a{2,}', 'aa', 'aa'],
		['a{2,}', 'aaaaa', 'aaaaa'],
		// Upper bound only; zero repetitions are allowed.
		['a{,3}', 'aaaa', 'aaa'],
		['a{,3}', 'aa', 'aa'],
		['a{,3}', '', ''],
		// Both bounds.
		['a{2,4}', 'aa', 'aa'],
		['a{2,4}', 'aaa', 'aaa'],
		['a{2,4}', 'aaaa', 'aaaa'],
		['a{2,4}', 'aaaaa', 'aaaa'],
		// Ranges on escape classes; with five digits the match has to start
		// one character later so that '-' follows at most four digits.
		[date_like, '123-ab', '123-ab'],
		[date_like, '12345-ab', '2345-ab'],
		// Negative cases.
		['a{3}', 'aa', 'none'],
		['a{2,}', 'a', 'none'],
		['a{2,4}', 'a', 'none'],
		[date_like, '1-ab', 'none'],
		[date_like, '123-a', 'none'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
| 358 | |
// test_named_groups checks (?P<name>...) groups: lookup by name, lookup by
// position through `groups`, nesting, mixing with unnamed groups, and the
// no-match case.
fn test_named_groups() {
	println('\n--- Testing Named Groups ---')

	// Two named groups side by side.
	r := pcre.compile(r'(?P<year>\d{4})-(?P<month>\d{2})') or { panic(err) }
	m := r.find('Date: 2025-01') or { panic('Match not found') }

	// Named groups are still reachable by position ...
	assert m.groups[0] == '2025'
	assert m.groups[1] == '01'

	// ... and by name; an unknown name gives an empty string.
	assert r.group_by_name(m, 'year') == '2025'
	assert r.group_by_name(m, 'month') == '01'
	assert r.group_by_name(m, 'missing') == ''

	// A named group nested inside another named group.
	r_nested := pcre.compile(r'(?P<entry>key: (?P<val>\d+))') or { panic(err) }
	m_nested := r_nested.find('List [ key: 99 ]') or { panic('Match not found') }
	entry := r_nested.group_by_name(m_nested, 'entry')
	val := r_nested.group_by_name(m_nested, 'val')
	println('Nested: entry="${entry}", val="${val}"')
	assert entry == 'key: 99'
	assert val == '99'

	// A named group followed by an unnamed one: both occupy a position.
	r_mixed := pcre.compile(r'(?P<key>\w+): (\d+)') or { panic(err) }
	m_mixed := r_mixed.find('Price: 100') or { panic('Match not found') }
	assert m_mixed.groups[0] == 'Price'
	assert m_mixed.groups[1] == '100'
	assert r_mixed.group_by_name(m_mixed, 'key') == 'Price'

	// Three consecutive named groups, each capturing its own name.
	r_seq := pcre.compile('(?P<a>a)(?P<b>b)(?P<c>c)') or { panic(err) }
	m_seq := r_seq.find('abc') or { panic('Match not found') }
	for name in ['a', 'b', 'c'] {
		assert r_seq.group_by_name(m_seq, name) == name
	}

	// Negative cases: a two-digit year cannot satisfy \d{4}.
	short_year := r.find('Date: 99-01')
	if short_year != none {
		assert false, 'Should not match'
	} else {
		println('Found: none (Expected: none)')
	}
	tst_find(r'(?P<id>\d+)', 'abc', 'none')
}
| 409 | |
// test_non_capturing_groups checks that (?:...) groups take part in matching
// but do not show up in the list of captured groups.
fn test_non_capturing_groups() {
	println('\n--- Testing Non-Capturing Groups ---')

	no_groups := []string{}

	// Only a non-capturing group: nothing is captured.
	tst_find_with_groups('(?:a|b)c', 'ac', 'ac', no_groups)
	// A non-capturing group between two capturing ones is skipped.
	tst_find_with_groups('(a)(?:b)(c)', 'abc', 'abc', ['a', 'c'])
	// A non-capturing group inside a capturing one.
	tst_find_with_groups('(a(?:b)c)', 'abc', 'abc', ['abc'])
	// A non-capturing prefix followed by a captured value.
	tst_find_with_groups(r'(?:header): (\d+)', 'header: 123', 'header: 123', ['123'])

	// Negative cases: [pattern, text], neither may match.
	for miss in [['(?:a|b)c', 'dc'], ['(?:a)b', 'c']] {
		tst_find(miss[0], miss[1], 'none')
	}
}
| 425 | |
| 426 | // --- Helper Functions --- |
| 427 | |
// tst_find compiles `pattern`, runs find() on `text` and hands the outcome to
// check_result together with `expected` ('none' meaning "no match").
// A pattern that fails to compile is reported and fails the test.
fn tst_find(pattern string, text string, expected string) {
	print('[find] Pattern: "${pattern}", Text: "${text}" -> ')
	if re := pcre.compile(pattern) {
		outcome := re.find(text)
		check_result(outcome, expected)
	} else {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
	}
}
| 438 | |
// tst_find_all compiles `pattern`, runs find_all() on `text` and asserts that
// the texts of the returned matches equal `expected`, in order.
// Panics if the pattern does not compile.
fn tst_find_all(pattern string, text string, expected []string) {
	print('[find_all] Pattern: "${pattern}", Text: "${text}" -> ')
	re := pcre.compile(pattern) or { panic(err) }

	// Keep only the matched text of every result.
	found := re.find_all(text).map(it.text)

	println('Found: ${found}')
	assert found == expected
}
| 452 | |
// tst_find_from compiles `pattern` and runs find_from() on `text` starting at
// `start`. On a match it asserts both the matched text and its start position;
// on no match it asserts that `expected_text` is 'none'.
// Panics if the pattern does not compile.
fn tst_find_from(pattern string, text string, start int, expected_pos int, expected_text string) {
	print('[find_from] Pattern: "${pattern}", Start: ${start} -> ')
	re := pcre.compile(pattern) or { panic(err) }

	if hit := re.find_from(text, start) {
		println('Found: "${hit.text}" at ${hit.start}')
		assert hit.text == expected_text
		assert hit.start == expected_pos
	} else {
		println('Found: none')
		assert expected_text == 'none'
	}
}
| 467 | |
// tst_replace compiles `pattern`, applies replace() to `text` with the
// replacement `repl` and asserts that the outcome equals `expected`.
// Panics if the pattern does not compile.
fn tst_replace(pattern string, text string, repl string, expected string) {
	print('[replace] Pattern: "${pattern}", Repl: "${repl}" -> ')
	re := pcre.compile(pattern) or { panic(err) }
	actual := re.replace(text, repl)
	println('Result: "${actual}"')
	assert actual == expected
}
| 475 | |
// tst_fullmatch compiles `pattern`, runs fullmatch() on `text` and hands the
// outcome to check_result together with `expected` ('none' meaning "no match").
// A pattern that fails to compile is reported and fails the test.
fn tst_fullmatch(pattern string, text string, expected string) {
	print('[fullmatch] Pattern: "${pattern}", Text: "${text}" -> ')
	if re := pcre.compile(pattern) {
		outcome := re.fullmatch(text)
		check_result(outcome, expected)
	} else {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
	}
}
| 486 | |
// check_result prints what was found and asserts it against `expected`:
// a match must have exactly that text, and the absence of a match is only
// accepted when `expected` is the literal string 'none'.
fn check_result(match_res ?pcre.Match, expected string) {
	if hit := match_res {
		println('Found: "${hit.text}" (Expected: "${expected}")')
		assert hit.text == expected
	} else {
		println('Found: none (Expected: "${expected}")')
		assert expected == 'none'
	}
}
| 496 | |
// tst_find_with_groups compiles `pattern`, runs find() on `text` and asserts
// both the matched text (`expected_match`) and the captured groups
// (`expected_groups`). A compile error or a missing match fails the test.
fn tst_find_with_groups(pattern string, text string, expected_match string, expected_groups []string) {
	print('[find+groups] Pattern: "${pattern}", Text: "${text}" -> ')
	r := pcre.compile(pattern) or {
		println('Compile error: ${err}')
		assert false, 'Unexpected compile error: ${err}'
		return
	}
	match_res := r.find(text)
	if match_res != none {
		println('Found: "${match_res.text}", Groups: ${match_res.groups}')
		assert match_res.text == expected_match
		assert match_res.groups == expected_groups
	} else {
		println('Found: none')
		// Callers of this helper always expect a match; say which one was
		// missing so the failure can be understood without the console output.
		assert false, 'Expected match "${expected_match}" for pattern "${pattern}", found none'
	}
}
| 514 | |
// tst_compile_error asserts that `pattern` is rejected by pcre.compile().
// The error is printed when compilation fails as expected; a pattern that
// compiles successfully fails the test.
fn tst_compile_error(pattern string) {
	print('[compile_error] Pattern: "${pattern}" -> ')
	if _ := pcre.compile(pattern) {
		println('Error: Did not get a compilation error!')
		assert false
	} else {
		println('Caught expected error: ${err}')
	}
}
| 524 | |
// test_non_greedy_quantifiers checks the lazy quantifier forms *?, +?, ?? and
// {m,n}? (next to their greedy counterparts) through tst_find.
fn test_non_greedy_quantifiers() {
	println('\n--- Testing Non-Greedy Quantifiers (*?, +?, ??, {m,n}?) ---')

	// Each row is [pattern, text, expected match].
	rows := [
		// Lazy star stops at the first closing '>' ...
		[r'<.*?>', '<div>content</div>', '<div>'],
		// ... whereas the greedy star runs to the last one.
		[r'<.*>', '<div>content</div>', '<div>content</div>'],
		// Lazy plus takes the minimum, a single 'a' ...
		[r'a+?', 'aaaaa', 'a'],
		// ... but still expands as far as needed to reach the 'b'.
		[r'a+?b', 'aaab', 'aaab'],
		// Lazy optional prefers zero occurrences, so the match is empty.
		[r'a??', 'a', ''],
		// Skipping the 'u' works right away for 'color' ...
		[r'colou??r', 'color', 'color'],
		// ... and for 'colour' the skip fails on 'r', so the 'u' is taken.
		[r'colou??r', 'colour', 'colour'],
		// Lazy range takes the minimum (2 digits), greedy the maximum (5).
		[r'\d{2,5}?', '123456789', '12'],
		[r'\d{2,5}', '123456789', '12345'],
		// User-reported case: escaped characters around a lazy capture group
		// must match only the first '$t(...)' call, not up to the last ')'.
		[r'\$t\((.*?)\)', r'$t(common.hello) dear $t(common.name)', r'$t(common.hello)'],
		// Edge cases.
		// The lazy span ends at the first 'y'.
		[r'x.*?y', 'x123y456y', 'x123y'],
		// Anchored at the start, the lazy span grows until it meets 'b'.
		[r'^.*?b', '123b', '123b'],
		// Whether greedy or lazy, the optional 'a' must give way so that the
		// mandatory final 'a' can still match the only character available.
		[r'a?a', 'a', 'a'],
		[r'a??a', 'a', 'a'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}
}
| 574 | |
// test_compatibility_layer exercises the compatibility API: new_regex(),
// match_str(), get() and get_all(). The trailing `0` passed to new_regex() and
// match_str() stands in for an options/flags argument.
fn test_compatibility_layer() {
	re := pcre.new_regex(r'(\w+)\s+(\d+)', 0) or {
		assert false, 'new_regex failed to compile: ${err}'
		return
	}
	text := 'item 42 ignored item 99'

	// Searching from offset 0 must hit "item 42".
	first := re.match_str(text, 0, 0) or {
		assert false, 'match_str failed to find match'
		return
	}

	// Index 0 is the whole match, 1 and 2 are the groups (\w+) and (\d+).
	wanted := ['item 42', 'item', '42']
	for idx, want in wanted {
		got := first.get(idx) or { '' }
		assert got == want
	}

	// One past the last group must give none.
	if _ := first.get(3) {
		assert false, 'get(3) should return none for 2 groups'
	}

	// get_all() returns the whole match followed by every group.
	everything := first.get_all()
	assert everything.len == 3
	for idx, want in wanted {
		assert everything[idx] == want
	}

	// Searching from offset 7 (just past "item 42") must hit "item 99".
	second := re.match_str(text, 7, 0) or {
		assert false, 'match_str failed to find second match from offset'
		return
	}
	second_whole := second.get(0) or { '' }
	assert second_whole == 'item 99'
	second_number := second.get(2) or { '' }
	assert second_number == '99'

	// Searching from the very end of the text must find nothing.
	if _ := re.match_str(text, text.len, 0) {
		assert false, 'match_str should return none when no match is found'
	}
}
| 638 | |
// test_hex_escapes checks the \xHH (two hex digits) and \XHHHH (four hex
// digits) escapes, plus malformed escapes that must be rejected at compile time.
fn test_hex_escapes() {
	// Each row is [pattern, text, expected match].
	rows := [
		// \xHH
		[r'\x41', 'ABC', 'A'], // 0x41 = 'A'
		[r'\x61', 'abc', 'a'], // 0x61 = 'a'
		[r'\x41+', 'AAAB', 'AAA'],
		[r'\x20\x41', ' A test', ' A'], // space followed by 'A'
		// \XHHHH (Unicode code point)
		[r'\X0041', 'ABC', 'A'], // U+0041 = 'A'
		[r'\X0061', 'abc', 'a'], // U+0061 = 'a'
		[r'\X03B1', 'αβγ', 'α'], // U+03B1 = 'α'
		// A whole word spelled with escapes: "Hello".
		[r'\x48\x65\x6C\x6C\x6F', 'Hello World', 'Hello'],
	]
	for row in rows {
		tst_find(row[0], row[1], row[2])
	}

	// Malformed escapes: one digit only, non-hex characters, three digits only.
	for malformed in [r'\x4', r'\xGG', r'\X004'] {
		tst_compile_error(malformed)
	}
}
| 659 | |
// test_duplicate_named_groups checks that reusing a group name is a compile
// error, while distinct names compile and can be looked up by name.
fn test_duplicate_named_groups() {
	// The name `id` appears twice: must not compile.
	tst_compile_error(r'(?P<id>\d+)-(?P<id>\w+)')

	// Distinct names `a` and `b`: must compile, match and resolve by name.
	if re := pcre.compile(r'(?P<a>\d+)-(?P<b>\w+)') {
		if hit := re.find('12-abc') {
			assert re.group_by_name(hit, 'a') == '12'
			assert re.group_by_name(hit, 'b') == 'abc'
		} else {
			assert false, 'Should match'
		}
	} else {
		assert false, 'Should compile: ${err}'
	}
}
| 675 | |
// test_invalid_quantifier_ranges checks that a {min,max} range with min > max
// is rejected at compile time, and that the degenerate range {0,0} is accepted.
fn test_invalid_quantifier_ranges() {
	// Lower bound above upper bound: must not compile.
	for inverted in [r'a{3,1}', r'a{5,2}'] {
		tst_compile_error(inverted)
	}

	// {0,0} must compile; 'a{0,0}b' then matches a plain 'b'.
	if re := pcre.compile(r'a{0,0}b') {
		if hit := re.find('b') {
			assert hit.text == 'b'
		} else {
			assert false, 'Should match'
		}
	} else {
		assert false, 'Should compile: ${err}'
	}
}
| 692 | |
// test_find_all_utf8_safety checks that find_all() with a pattern that can
// match the empty string never reports a match boundary in the middle of a
// multi-byte UTF-8 character, and that it terminates on text containing an emoji.
fn test_find_all_utf8_safety() {
	sample := 'aé' // 'é' is 2 bytes (0xC3 0xA9)
	// Taken once, outside the loop: the bytes do not change between matches.
	sample_bytes := sample.bytes()

	r := pcre.compile(r'x*') or { panic(err) }
	for m in r.find_all(sample) {
		// m.start is used as a byte offset into the text; the end offset is
		// derived from it and the byte length of the matched text.
		m_end := m.start + m.text.len

		// A byte of the form 10xxxxxx is a UTF-8 continuation byte, i.e. the
		// inside of a multi-byte character. Offsets equal to the text length
		// are valid boundaries and have no byte to inspect.
		if m.start < sample_bytes.len {
			assert (sample_bytes[m.start] & 0xC0) != 0x80, 'Misaligned match start at ${m.start}'
		}
		if m_end < sample_bytes.len {
			assert (sample_bytes[m_end] & 0xC0) != 0x80, 'Misaligned match end at ${m_end}'
		}
	}

	// find_all must return (not loop forever) on an emoji and report
	// at least one match.
	r2 := pcre.compile(r'y*') or { panic(err) }
	matches2 := r2.find_all('😀!')
	assert matches2.len > 0
}
| 710 | |