v / vlib / regex / pcre / regex.v
1314 lines · 1224 sloc · 33.92 KB · 2f74099fd5025a7addcb910b2018d885d6720d9a
Raw
1/*
2regex2 0.9.6 beta (VM Edition) - Performance Optimized
3
4Copyright (c) 2026 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7
8This file contains a regex module based on a Virtual Machine approach.
9
10Features:
11 - **Non-recursive VM**: Safe execution without stack overflow on complex patterns.
12 - UTF8 support with Fast ASCII path.
13 - Literal characters, '.', '*', '{m,n}'.
14 - Short quantifiers: '?', '+'.
15 - Non-greedy quantifiers: '*?', '+?', '??'.
16 - Nested groups: '()'.
17 - Named groups: '(?P<name>...)'.
18 - Non-capturing groups: '(?:...)'.
19 - Character classes: \w, \W, \d, \D, \s, \S, \a, \A.
20 - Hex escapes: \xHH, \XHHHH.
21 - Alternation: '|'.
22 - Character classes: [abc], [^abc], [a-z].
23 - Anchors: '^' (start of string), '$' (end of string).
24 - Word boundaries: '\b' (word boundary), '\B' (non-word boundary).
25 - Inline Flags:
26 - '(?i)' Case insensitive.
27 - '(?m)' Multiline (anchors match newlines).
28 - '(?s)' Dot-all (dot matches newline).
29
30Configuration:
31 - `max_stack_depth`: Controls the dynamic growth limit of the backtracking stack.
32 Default is 2048. Increase this value if you encounter complex patterns failing
33 on deep recursions/backtracking, or decrease it to limit memory usage.
34
35Functions:
36 - `compile(pattern) !Regex` -> Compiles a pattern into a Regex object.
37 - `(r Regex) find(text) ?Match` -> Finds the first match in a string.
38 - `(r Regex) find_all(text) []Match` -> Finds all non-overlapping matches.
39 - `(r Regex) find_from(text, start_index) ?Match` -> Finds the first match starting from a specific index.
40 - `(r Regex) replace(text, repl) string` -> Finds and replaces the first match.
41 - `(r Regex) fullmatch(text string) ?Match` -> Match exact strings.
42 - `(r Regex) group_by_name(m Match, name string) string` -> Get text of a named group.
43
44Key Architectural Features and Optimizations:
45 - **Pre-allocated Machine workspace**: All search operations are zero-allocation.
46 - **Raw pointer access**: Bypasses bounds checking for string and instruction access.
47 - **Fast ASCII Path**: Characters < 128 bypass heavy UTF-8 decoding.
48 - **Instruction merging**: Consecutive character matches are merged into string blocks.
49 - **Bitmap lookups**: ASCII character classes use a 128-bit bitset for O(1) matching.
50 - **NFA Virtual Machine**: Executes bytecode instructions to simulate pattern matching.
51 - **Dynamic Stack Growth**: Automatically expands the backtracking stack to prevent false negatives.
52 - **Zero-Allocation Search**: Reuses a pre-allocated Machine workspace for search operations.
53 - **Anchored Optimization**: Patterns starting with '^' skip the scanning loop.
54 - **Prefix Skipping**: Uses Boyer-Moore-like skipping for literal prefixes.
55
56*/
57
58module pcre
59
60import strconv
61import strings
62
63/******************************************************************************
64*
65* VM Instruction Definition
66*
67******************************************************************************/
68
69// InstType defines the operation codes for the Virtual Machine.
70enum InstType as u8 {
71 match // Halt and signal a successful match.
72 char // Match a single UTF-8 rune.
73 string // Match a sequence of ASCII characters (merged optimization).
74 any // Match any character (dot).
75 class // Match a character class (e.g., [a-z] or \d).
76 split // Branch execution: target_x (primary), target_y (backtrack).
77 jmp // Unconditional jump to target_x.
78 save // Save current string position into a capture group index.
79 assert_start // Assert current position is start of string.
80 assert_end // Assert current position is end of string.
81 assert_line_start // Assert current position is start of a line (multiline mode).
82 assert_line_end // Assert current position is end of a line (multiline mode).
83 assert_bound // Assert word boundary (\b).
84 assert_nbound // Assert non-word boundary (\B).
85}
86
87// Inst represents a single bytecode instruction.
88// Packed for memory locality.
89struct Inst {
90mut:
91 typ InstType
92 val rune // Used by .char
93 val_str string // Used by .string
94 val_len int // Byte length of val_str
95 target_x int // Primary jump/split target
96 target_y int // Backtrack target for .split
97 group_idx int // Capture group index for .save
98 char_class []rune // Literal runes for character classes
99 bitmap [4]u32 // 128-bit bitset for ASCII character classes
100 inverted bool // Negation flag for classes
101 ignore_case bool // Case-insensitive flag
102 dot_all bool // Whether '.' matches '\n'
103}
104
105// Machine provides a workspace for VM execution.
106// To ensure thread safety, this is created per top-level API call.
107pub struct Machine {
108mut:
109 stack []int // Backtracking stack stores: [capture_states..., string_ptr, next_pc]
110 captures []int // Flat array of [start, end] byte indices for groups
111}
112
113// Regex is the compiled regular expression object.
114pub struct Regex {
115pub:
116 pattern string // The original regex string
117 prog []Inst // Compiled bytecode
118 total_groups int // Number of capture groups defined in pattern
119 group_map map[string]int // Mapping of names to indices for (?P<name>...)
120 prefix_lit string // Pre-calculated literal prefix for fast-skip optimization
121 has_prefix bool // Whether a literal prefix exists
122 anchored bool // True if pattern starts with '^' (optimization hint)
123pub mut:
124 max_stack_depth int // User-defined stack limit hint
125}
126
127// Match contains the results of a successful regex match.
128pub struct Match {
129pub:
130 text string // The full text of the match
131 start int // Starting byte index in the source string
132 end int // Ending byte index in the source string
133 groups []string // Sub-strings captured by groups
134}
135
136// Internal structures for Compilation
137struct Quantifier {
138mut:
139 min int
140 max int // -1 for infinity
141 greedy bool
142}
143
144struct Flags {
145mut:
146 ignore_case bool
147 multiline bool
148 dot_all bool
149}
150
151enum NodeType {
152 chr
153 any_char
154 group
155 alternation
156 char_class
157 start_of_string
158 end_of_string
159 word_boundary
160 non_word_boundary
161 word_char
162 non_word_char
163 digit
164 non_digit
165 whitespace
166 non_whitespace
167 lowercase_char
168 uppercase_char
169}
170
171struct Node {
172mut:
173 typ NodeType
174 chr rune
175 quant Quantifier
176 nodes []Node
177 alternatives [][]Node
178 group_capture_index int = -1
179 char_set []rune
180 inverted bool
181 ignore_case bool
182 multiline bool
183 dot_all bool
184}
185
186/******************************************************************************
187*
188* Internal Utilities (Inlined for Speed)
189*
190******************************************************************************/
191
192// read_rune_at decodes a UTF-8 rune from a byte pointer safely.
193// Marked inline to be embedded directly into the VM loop.
194@[inline]
195fn read_rune_at(str &u8, len int, index int) (rune, int) {
196 unsafe {
197 if index >= len {
198 return 0, 0
199 }
200 b0 := u32(str[index])
201 if b0 < 0x80 {
202 return rune(b0), 1
203 }
204 if (b0 & 0xE0) == 0xC0 && index + 1 < len {
205 return rune(((b0 & 0x1F) << 6) | (u32(str[index + 1]) & 0x3F)), 2
206 }
207 if (b0 & 0xF0) == 0xE0 && index + 2 < len {
208 return rune(((b0 & 0x0F) << 12) | ((u32(str[index + 1]) & 0x3F) << 6) | (u32(str[
209 index + 2]) & 0x3F)), 3
210 }
211 if (b0 & 0xF8) == 0xF0 && index + 3 < len {
212 return rune(((b0 & 0x07) << 18) | ((u32(str[index + 1]) & 0x3F) << 12) | ((u32(str[
213 index + 2]) & 0x3F) << 6) | (u32(str[index + 3]) & 0x3F)), 4
214 }
215 }
216 return 0, 0
217}
218
219// is_word_char returns true for alphanumeric ASCII characters and underscore.
220@[inline]
221fn is_word_char(r u8) bool {
222 return (r >= `a` && r <= `z`) || (r >= `A` && r <= `Z`) || (r >= `0` && r <= `9`) || r == `_`
223}
224
225// set_bitmap sets a specific bit in a 128-bit bitset for ASCII matching.
226@[inline]
227fn set_bitmap(mut bitmap [4]u32, r rune) {
228 if r >= 0 && r < 128 {
229 idx := u32(r) >> 5
230 bit := u32(r) & 31
231 bitmap[idx] |= (u32(1) << bit)
232 }
233}
234
235/******************************************************************************
236*
237* Compiler Logic
238*
239******************************************************************************/
240
241// compile transforms a regex pattern string into a Regex object.
242pub fn compile(pattern string) !Regex {
243 mut group_map := map[string]int{}
244 initial_flags := Flags{false, false, false}
245
246 // Phase 1: AST Parsing
247 nodes, _, final_group_count := parse_nodes(pattern, 0, `\0`, 0, initial_flags, mut group_map)!
248
249 root := Node{
250 typ: .group
251 nodes: nodes
252 quant: Quantifier{1, 1, true}
253 }
254
255 // Phase 2: Bytecode Emission
256 mut compiler := Compiler{
257 prog: []Inst{cap: 128}
258 }
259 compiler.emit_node(root)
260 compiler.emit(Inst{ typ: .match })
261
262 // Phase 3: Optimization
263 optimized_prog := compiler.optimize()
264
265 // Detect Prefix and Anchor optimizations
266 mut prefix := ''
267 mut has_prefix := false
268 mut anchored := false
269
270 if optimized_prog.len > 0 {
271 first := optimized_prog[0]
272 if first.typ == .string {
273 prefix = first.val_str
274 has_prefix = true
275 } else if first.typ == .char && !first.ignore_case && first.val < 128 {
276 prefix = unsafe { u8(first.val).ascii_str() }
277 has_prefix = true
278 } else if first.typ == .assert_start {
279 anchored = true
280 }
281 }
282
283 return Regex{
284 max_stack_depth: 2048
285 pattern: pattern
286 prog: optimized_prog
287 total_groups: final_group_count
288 group_map: group_map
289 prefix_lit: prefix
290 has_prefix: has_prefix
291 anchored: anchored
292 }
293}
294
295// new_machine allocates a new VM state machine.
296// This isolates the runtime memory (stack/captures) from the compiled regex, allowing thread-safe usage.
297pub fn (r &Regex) new_machine() Machine {
298 // Pre-allocate enough space for stack and captures to avoid re-allocation in hot path
299 return Machine{
300 stack: []int{len: r.max_stack_depth}
301 captures: []int{len: r.total_groups * 2}
302 }
303}
304
305// Compiler holds the state for generating the bytecode instructions.
306struct Compiler {
307mut:
308 prog []Inst
309}
310
311// emit appends an instruction to the program and returns its index.
312fn (mut c Compiler) emit(i Inst) int {
313 c.prog << i
314 return c.prog.len - 1
315}
316
317// optimize merges consecutive literal characters into single string instructions
318// and resolves jump targets to absolute indices.
319fn (mut c Compiler) optimize() []Inst {
320 mut targets := map[int]bool{}
321 for inst in c.prog {
322 if inst.typ == .split || inst.typ == .jmp {
323 targets[inst.target_x] = true
324 targets[inst.target_y] = true
325 }
326 }
327
328 mut new_prog := []Inst{cap: c.prog.len}
329 mut idx_map := map[int]int{}
330
331 mut i := 0
332 for i < c.prog.len {
333 inst := c.prog[i]
334 idx_map[i] = new_prog.len
335
336 // Optimization: Merge consecutive chars
337 if inst.typ == .char && !inst.ignore_case && inst.val < 128 {
338 mut s_val := unsafe { u8(inst.val).ascii_str() }
339 mut j := i + 1
340 for j < c.prog.len && !targets[j] {
341 next := c.prog[j]
342 if next.typ == .char && !next.ignore_case && next.val < 128 {
343 s_val += unsafe { u8(next.val).ascii_str() }
344 j++
345 } else {
346 break
347 }
348 }
349 if j > i + 1 {
350 new_prog << Inst{
351 typ: .string
352 val_str: s_val
353 val_len: s_val.len
354 }
355 for k := i + 1; k < j; k++ {
356 idx_map[k] = new_prog.len
357 }
358 i = j
359 continue
360 }
361 }
362 new_prog << inst
363 i++
364 }
365 idx_map[c.prog.len] = new_prog.len
366
367 // Fix jump offsets
368 for mut inst in new_prog {
369 if inst.typ == .split || inst.typ == .jmp {
370 inst.target_x = idx_map[inst.target_x] or { inst.target_x }
371 inst.target_y = idx_map[inst.target_y] or { inst.target_y }
372 }
373 }
374 return new_prog
375}
376
377// emit_class generates the instructions for a character class node.
378// It populates the bitmap for O(1) ASCII matching and the slice for Unicode.
379fn (mut c Compiler) emit_class(node Node) {
380 mut bitmap := [4]u32{}
381 mut char_class := node.char_set.clone()
382 mut inverted := node.inverted
383
384 // Pre-compile common classes into the bitmap for O(1) lookups
385 match node.typ {
386 .word_char {
387 for r := `0`; r <= `9`; r++ {
388 set_bitmap(mut bitmap, r)
389 }
390 for r := `a`; r <= `z`; r++ {
391 set_bitmap(mut bitmap, r)
392 }
393 for r := `A`; r <= `Z`; r++ {
394 set_bitmap(mut bitmap, r)
395 }
396 set_bitmap(mut bitmap, `_`)
397 }
398 .non_word_char {
399 inverted = true
400 for r := `0`; r <= `9`; r++ {
401 set_bitmap(mut bitmap, r)
402 }
403 for r := `a`; r <= `z`; r++ {
404 set_bitmap(mut bitmap, r)
405 }
406 for r := `A`; r <= `Z`; r++ {
407 set_bitmap(mut bitmap, r)
408 }
409 set_bitmap(mut bitmap, `_`)
410 }
411 .digit {
412 for r := `0`; r <= `9`; r++ {
413 set_bitmap(mut bitmap, r)
414 }
415 }
416 .non_digit {
417 inverted = true
418 for r := `0`; r <= `9`; r++ {
419 set_bitmap(mut bitmap, r)
420 }
421 }
422 .whitespace {
423 for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
424 set_bitmap(mut bitmap, r)
425 }
426 }
427 .non_whitespace {
428 inverted = true
429 for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
430 set_bitmap(mut bitmap, r)
431 }
432 }
433 .lowercase_char {
434 for r := `a`; r <= `z`; r++ {
435 set_bitmap(mut bitmap, r)
436 }
437 }
438 .uppercase_char {
439 for r := `A`; r <= `Z`; r++ {
440 set_bitmap(mut bitmap, r)
441 }
442 }
443 .char_class {
444 for r in node.char_set {
445 set_bitmap(mut bitmap, r)
446 if node.ignore_case {
447 if r >= `a` && r <= `z` {
448 set_bitmap(mut bitmap, r - 32)
449 } else if r >= `A` && r <= `Z` {
450 set_bitmap(mut bitmap, r + 32)
451 }
452 }
453 }
454 }
455 else {}
456 }
457
458 c.emit(Inst{
459 typ: .class
460 char_class: char_class
461 bitmap: bitmap
462 inverted: inverted
463 ignore_case: node.ignore_case
464 })
465}
466
467// emit_node handles quantifiers and loops, delegating the actual logic to emit_logic.
468fn (mut c Compiler) emit_node(node Node) {
469 for _ in 0 .. node.quant.min {
470 c.emit_logic(node)
471 }
472
473 if node.quant.max == -1 {
474 split_idx := c.emit(Inst{ typ: .split })
475 start_pc := c.prog.len
476 c.emit_logic(node)
477 c.emit(Inst{ typ: .jmp, target_x: split_idx })
478 if node.quant.greedy {
479 c.prog[split_idx].target_x = start_pc
480 c.prog[split_idx].target_y = c.prog.len
481 } else {
482 c.prog[split_idx].target_x = c.prog.len
483 c.prog[split_idx].target_y = start_pc
484 }
485 } else if node.quant.max > node.quant.min {
486 rem := node.quant.max - node.quant.min
487 mut splits := []int{}
488 for _ in 0 .. rem {
489 idx := c.emit(Inst{ typ: .split })
490 match_pc := c.prog.len
491 if node.quant.greedy {
492 c.prog[idx].target_x = match_pc
493 } else {
494 c.prog[idx].target_y = match_pc
495 }
496 c.emit_logic(node)
497 splits << idx
498 }
499 end_pc := c.prog.len
500 for idx in splits {
501 if node.quant.greedy {
502 c.prog[idx].target_y = end_pc
503 } else {
504 c.prog[idx].target_x = end_pc
505 }
506 }
507 }
508}
509
510// emit_logic generates instructions for specific node types (char, group, alternation).
511fn (mut c Compiler) emit_logic(node Node) {
512 match node.typ {
513 .chr {
514 c.emit(Inst{ typ: .char, val: node.chr, ignore_case: node.ignore_case })
515 }
516 .any_char {
517 c.emit(Inst{ typ: .any, dot_all: node.dot_all })
518 }
519 .group {
520 if node.group_capture_index != -1 {
521 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 })
522 }
523 for child in node.nodes {
524 c.emit_node(child)
525 }
526 if node.group_capture_index != -1 {
527 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 + 1 })
528 }
529 }
530 .alternation {
531 if node.alternatives.len == 0 {
532 return
533 }
534 mut end_jumps := []int{}
535 for i := 0; i < node.alternatives.len - 1; i++ {
536 split_idx := c.emit(Inst{ typ: .split })
537 c.prog[split_idx].target_x = c.prog.len
538 for child in node.alternatives[i] {
539 c.emit_node(child)
540 }
541 end_jumps << c.emit(Inst{ typ: .jmp })
542 c.prog[split_idx].target_y = c.prog.len
543 }
544 for child in node.alternatives.last() {
545 c.emit_node(child)
546 }
547 for idx in end_jumps {
548 c.prog[idx].target_x = c.prog.len
549 }
550 }
551 .start_of_string {
552 c.emit(Inst{
553 typ: if node.multiline { InstType.assert_line_start } else { InstType.assert_start }
554 })
555 }
556 .end_of_string {
557 c.emit(Inst{
558 typ: if node.multiline { InstType.assert_line_end } else { InstType.assert_end }
559 })
560 }
561 .word_boundary {
562 c.emit(Inst{ typ: .assert_bound })
563 }
564 .non_word_boundary {
565 c.emit(Inst{ typ: .assert_nbound })
566 }
567 else {
568 c.emit_class(node)
569 }
570 }
571}
572
573// parse_nodes implements a recursive descent parser to construct the AST from the pattern string.
574fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_start int, passed_flags Flags, mut group_map map[string]int) !([]Node, int, int) {
575 mut pos := pos_start
576 mut group_counter := group_counter_start
577 mut current_flags := passed_flags
578 mut alternatives := [][]Node{}
579 mut current_sequence := []Node{}
580
581 for pos < pattern.len {
582 chr, char_len := read_rune_at(pattern.str, pattern.len, pos)
583 if chr == terminator {
584 pos += char_len
585 if alternatives.len > 0 {
586 alternatives << current_sequence
587 return [
588 Node{
589 typ: .alternation
590 alternatives: alternatives
591 quant: Quantifier{1, 1, true}
592 },
593 ], pos, group_counter
594 }
595 return current_sequence, pos, group_counter
596 }
597 if chr == `|` {
598 pos += char_len
599 if current_sequence.len == 0 {
600 return error('Empty alternative')
601 }
602 alternatives << current_sequence
603 current_sequence = []Node{}
604 continue
605 }
606
607 mut parsed_nodes := []Node{}
608 match chr {
609 `^` {
610 pos += char_len
611 parsed_nodes << Node{
612 typ: .start_of_string
613 multiline: current_flags.multiline
614 }
615 }
616 `$` {
617 pos += char_len
618 parsed_nodes << Node{
619 typ: .end_of_string
620 multiline: current_flags.multiline
621 }
622 }
623 `.` {
624 pos += char_len
625 parsed_nodes << Node{
626 typ: .any_char
627 dot_all: current_flags.dot_all
628 }
629 }
630 `*`, `+`, `?`, `{` {
631 return error('Quantifier must follow a token')
632 }
633 `(` {
634 pos += char_len
635 mut idx := -1
636 mut cap := true
637 if pos < pattern.len && pattern[pos] == `?` {
638 pos++
639 if pos < pattern.len && pattern[pos] in [`i`, `m`, `s`] {
640 for pos < pattern.len && pattern[pos] != `)` {
641 match pattern[pos] {
642 `i` { current_flags.ignore_case = true }
643 `m` { current_flags.multiline = true }
644 `s` { current_flags.dot_all = true }
645 else {}
646 }
647
648 pos++
649 }
650 if pos < pattern.len {
651 pos++
652 }
653 continue
654 } else if pos < pattern.len && pattern[pos] == `:` {
655 cap = false
656 pos++
657 } else if pos < pattern.len && pattern[pos] == `P` {
658 pos++
659 if pos >= pattern.len || pattern[pos] != `<` {
660 return error('Invalid named group syntax')
661 }
662 pos++
663 end := pattern.index_after('>', pos) or { -1 }
664 if end == -1 {
665 return error('Unclosed named group')
666 }
667 name := pattern[pos..end]
668 if name in group_map {
669 return error('Duplicate named group: ${name}')
670 }
671 idx = group_counter
672 group_map[name] = idx
673 pos = end + 1
674 }
675 }
676 if cap {
677 if idx == -1 {
678 idx = group_counter
679 }
680 group_counter++
681 }
682 sub, new_p, new_c := parse_nodes(pattern, pos, `)`, group_counter, current_flags, mut
683 group_map)!
684 pos = new_p
685 group_counter = new_c
686 parsed_nodes << Node{
687 typ: .group
688 nodes: sub
689 group_capture_index: idx
690 }
691 }
692 `[` {
693 pos += char_len
694 end := pattern.index_after(']', pos) or { -1 }
695 if end == -1 {
696 return error('Unclosed character class')
697 }
698 content := pattern[pos..end]
699 pos = end + 1
700 inv := content.len > 0 && content[0] == `^`
701 mut set := []rune{}
702 mut i := if inv { 1 } else { 0 }
703 for i < content.len {
704 c, l := read_rune_at(content.str, content.len, i)
705 if i + l + 1 < content.len && content[i + l] == `-` {
706 ec, el := read_rune_at(content.str, content.len, i + l + 1)
707 for r := c; r <= ec; r++ {
708 set << r
709 }
710 i += l + 1 + el
711 } else {
712 set << c
713 i += l
714 }
715 }
716 parsed_nodes << Node{
717 typ: .char_class
718 char_set: set
719 inverted: inv
720 ignore_case: current_flags.ignore_case
721 }
722 }
723 `\\` {
724 pos += char_len
725 if pos >= pattern.len {
726 return error('Trailing backslash')
727 }
728 esc, el := read_rune_at(pattern.str, pattern.len, pos)
729 pos += el
730 match esc {
731 `w` {
732 parsed_nodes << Node{
733 typ: .word_char
734 }
735 }
736 `W` {
737 parsed_nodes << Node{
738 typ: .non_word_char
739 }
740 }
741 `d` {
742 parsed_nodes << Node{
743 typ: .digit
744 }
745 }
746 `D` {
747 parsed_nodes << Node{
748 typ: .non_digit
749 }
750 }
751 `s` {
752 parsed_nodes << Node{
753 typ: .whitespace
754 }
755 }
756 `S` {
757 parsed_nodes << Node{
758 typ: .non_whitespace
759 }
760 }
761 `b` {
762 parsed_nodes << Node{
763 typ: .word_boundary
764 }
765 }
766 `B` {
767 parsed_nodes << Node{
768 typ: .non_word_boundary
769 }
770 }
771 `a` {
772 parsed_nodes << Node{
773 typ: .lowercase_char
774 }
775 }
776 `A` {
777 parsed_nodes << Node{
778 typ: .uppercase_char
779 }
780 }
781 `x` {
782 // \xHH - two hex digits decode to a character
783 if pos + 2 > pattern.len {
784 return error('\\x requires exactly 2 hex digits')
785 }
786 hex_str := pattern[pos..pos + 2]
787 val := strconv.parse_uint(hex_str, 16, 32) or {
788 return error('Invalid hex escape \\x${hex_str}')
789 }
790 pos += 2
791 parsed_nodes << Node{
792 typ: .chr
793 chr: rune(val)
794 ignore_case: current_flags.ignore_case
795 }
796 }
797 `X` {
798 // \XHHHH - four hex digits decode to a Unicode codepoint
799 if pos + 4 > pattern.len {
800 return error('\\X requires exactly 4 hex digits')
801 }
802 hex_str := pattern[pos..pos + 4]
803 val := strconv.parse_uint(hex_str, 16, 32) or {
804 return error('Invalid hex escape \\X${hex_str}')
805 }
806 pos += 4
807 parsed_nodes << Node{
808 typ: .chr
809 chr: rune(val)
810 ignore_case: current_flags.ignore_case
811 }
812 }
813 else {
814 parsed_nodes << Node{
815 typ: .chr
816 chr: esc
817 ignore_case: current_flags.ignore_case
818 }
819 }
820 }
821 }
822 else {
823 pos += char_len
824 parsed_nodes << Node{
825 typ: .chr
826 chr: chr
827 ignore_case: current_flags.ignore_case
828 }
829 }
830 }
831
832 if parsed_nodes.len > 0 {
833 mut q := Quantifier{1, 1, true}
834 if pos < pattern.len {
835 peek := pattern[pos]
836 match peek {
837 `*` {
838 q = Quantifier{0, -1, true}
839 pos++
840 }
841 `+` {
842 q = Quantifier{1, -1, true}
843 pos++
844 }
845 `?` {
846 q = Quantifier{0, 1, true}
847 pos++
848 }
849 `{` {
850 end := pattern.index_after('}', pos) or { -1 }
851 if end == -1 {
852 return error('Unclosed quantifier')
853 }
854 parts := pattern[pos + 1..end].split(',')
855 min := parts[0].int()
856 max := if parts.len > 1 {
857 if parts[1] == '' { -1 } else { parts[1].int() }
858 } else {
859 min
860 }
861 if min < 0 || (max != -1 && max < min) {
862 return error('Invalid quantifier range {${min},${max}}')
863 }
864 q = Quantifier{min, max, true}
865 pos = end + 1
866 }
867 else {}
868 }
869
870 if pos < pattern.len && pattern[pos] == `?` {
871 q.greedy = false
872 pos++
873 }
874 }
875 parsed_nodes.last().quant = q
876 }
877 current_sequence << parsed_nodes
878 }
879
880 if alternatives.len > 0 {
881 if current_sequence.len == 0 {
882 return error('Empty alternative at end of pattern')
883 }
884 alternatives << current_sequence
885 return [
886 Node{
887 typ: .alternation
888 alternatives: alternatives
889 quant: Quantifier{1, 1, true}
890 },
891 ], pos, group_counter
892 }
893 return current_sequence, pos, group_counter
894}
895
896/******************************************************************************
897*
898* Virtual Machine Execution Engine (Highly Optimized)
899*
900******************************************************************************/
901
902// vm_match executes the bytecode against the input string using the provided Machine state.
903// OPTIMIZATION: Uses raw pointers for instruction and stack access to bypass bounds checking.
904@[direct_array_access]
905fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match {
906 unsafe {
907 // Optimization: Cast voidptr to typed pointer for direct indexing
908 mut cap_ptr := &int(m.captures.data)
909 cap_len := m.captures.len
910
911 // Fast clear of captures using pointer arithmetic (memset-like)
912 for i := 0; i < cap_len; i++ {
913 cap_ptr[i] = -1
914 }
915
916 mut sp := start_pos // String Pointer Index
917 mut stack_ptr := 0 // Stack Pointer Offset
918
919 cap_size := r.total_groups * 2
920 frame_size := cap_size + 2 // [captures..., saved_sp, saved_pc]
921
922 // Raw pointers for hot path access
923 prog_start := &Inst(r.prog.data)
924 mut inst_ptr := prog_start // PC as a pointer
925
926 str_ptr := text.str
927 str_len := text.len
928
929 // Cache stack data pointer (cast to typed pointer)
930 mut stack_data := &int(m.stack.data)
931 mut stack_max := m.stack.len
932
933 for {
934 // Check if we walked off the program (should be caught by match inst)
935 // Using pointer arithmetic: offset = (inst_ptr - prog_start)
936
937 match inst_ptr.typ {
938 .match {
939 // Only allocate result strings on successful match
940 mut s_groups := []string{cap: r.total_groups}
941 for i := 0; i < r.total_groups; i++ {
942 s, e := cap_ptr[i * 2], cap_ptr[i * 2 + 1]
943 s_groups << if s != -1 && e >= s { text[s..e] } else { '' }
944 }
945 return Match{
946 text: text[start_pos..sp]
947 start: start_pos
948 end: sp
949 groups: s_groups
950 }
951 }
952 .char {
953 if sp >= str_len {
954 goto backtrack
955 }
956 curr_byte := str_ptr[sp]
957
958 // Fast ASCII path
959 if curr_byte < 128 && inst_ptr.val < 128 {
960 mut c1, mut c2 := curr_byte, u8(inst_ptr.val)
961 if inst_ptr.ignore_case {
962 if c1 >= `a` && c1 <= `z` {
963 c1 -= 32
964 }
965 if c2 >= `a` && c2 <= `z` {
966 c2 -= 32
967 }
968 }
969 if c1 == c2 {
970 sp++
971 inst_ptr++
972 continue
973 }
974 goto backtrack
975 }
976
977 // UTF-8 Path
978 rn, l := read_rune_at(str_ptr, str_len, sp)
979 if l == 0 {
980 goto backtrack
981 }
982
983 mut match_ok := false
984 if inst_ptr.ignore_case {
985 r1 := if rn >= `a` && rn <= `z` { rn - 32 } else { rn }
986 r2 := if inst_ptr.val >= `a` && inst_ptr.val <= `z` {
987 inst_ptr.val - 32
988 } else {
989 inst_ptr.val
990 }
991 if r1 == r2 {
992 match_ok = true
993 }
994 } else if rn == inst_ptr.val {
995 match_ok = true
996 }
997
998 if match_ok {
999 sp += l
1000 inst_ptr++
1001 } else {
1002 goto backtrack
1003 }
1004 }
1005 .string {
1006 if sp + inst_ptr.val_len > str_len {
1007 goto backtrack
1008 }
1009 // Inline memcmp
1010 v_str := inst_ptr.val_str.str
1011 for i in 0 .. inst_ptr.val_len {
1012 if str_ptr[sp + i] != v_str[i] {
1013 goto backtrack
1014 }
1015 }
1016 sp += inst_ptr.val_len
1017 inst_ptr++
1018 }
1019 .class {
1020 if sp >= str_len {
1021 goto backtrack
1022 }
1023 c_byte := str_ptr[sp]
1024 mut matched := false
1025 mut cl := 1
1026
1027 // Optimization: Bitmap lookup for ASCII
1028 if c_byte < 128 {
1029 if (inst_ptr.bitmap[c_byte >> 5] & (u32(1) << (c_byte & 31))) != 0 {
1030 matched = true
1031 }
1032 } else {
1033 rn, l := read_rune_at(str_ptr, str_len, sp)
1034 cl = l
1035 for cr in inst_ptr.char_class {
1036 if cr == rn {
1037 matched = true
1038 break
1039 }
1040 }
1041 }
1042
1043 if matched != inst_ptr.inverted {
1044 sp += cl
1045 inst_ptr++
1046 } else {
1047 goto backtrack
1048 }
1049 }
1050 .any {
1051 if sp >= str_len {
1052 goto backtrack
1053 }
1054 if inst_ptr.dot_all || str_ptr[sp] != `\n` {
1055 _, cl := read_rune_at(str_ptr, str_len, sp)
1056 sp += cl
1057 inst_ptr++
1058 } else {
1059 goto backtrack
1060 }
1061 }
1062 .save {
1063 cap_ptr[inst_ptr.group_idx] = sp
1064 inst_ptr++
1065 }
1066 .split {
1067 if stack_ptr + frame_size >= stack_max {
1068 new_size := stack_max * 2
1069 if new_size > r.max_stack_depth {
1070 goto backtrack
1071 }
1072 m.stack.grow_len(new_size)
1073 stack_data = &int(m.stack.data) // Pointer might change on realloc
1074 stack_max = new_size
1075 }
1076
1077 // Optimization: Unrolled stack push
1078 stack_offset := stack_ptr
1079 for i in 0 .. cap_size {
1080 stack_data[stack_offset + i] = cap_ptr[i]
1081 }
1082 stack_data[stack_offset + cap_size] = sp
1083 // Save backtrack target PC index
1084 stack_data[stack_offset + cap_size + 1] = inst_ptr.target_y
1085
1086 stack_ptr += frame_size
1087 // Jump to primary target (convert index to pointer)
1088 // FIX: Use pointer indexing instead of addition to avoid type mismatch
1089 inst_ptr = &prog_start[inst_ptr.target_x]
1090 }
1091 .jmp {
1092 inst_ptr = &prog_start[inst_ptr.target_x]
1093 }
1094 .assert_start {
1095 if sp == 0 {
1096 inst_ptr++
1097 } else {
1098 goto backtrack
1099 }
1100 }
1101 .assert_end {
1102 if sp == str_len {
1103 inst_ptr++
1104 } else {
1105 goto backtrack
1106 }
1107 }
1108 .assert_line_start {
1109 if sp == 0 || (sp > 0 && str_ptr[sp - 1] == `\n`) {
1110 inst_ptr++
1111 } else {
1112 goto backtrack
1113 }
1114 }
1115 .assert_line_end {
1116 if sp == str_len || str_ptr[sp] == `\n` {
1117 inst_ptr++
1118 } else {
1119 goto backtrack
1120 }
1121 }
1122 .assert_bound, .assert_nbound {
1123 l := if sp > 0 { is_word_char(str_ptr[sp - 1]) } else { false }
1124 r_ := if sp < str_len { is_word_char(str_ptr[sp]) } else { false }
1125 if (inst_ptr.typ == .assert_bound && l != r_)
1126 || (inst_ptr.typ == .assert_nbound && l == r_) {
1127 inst_ptr++
1128 } else {
1129 goto backtrack
1130 }
1131 }
1132 }
1133
1134 continue
1135
1136 backtrack:
1137 if stack_ptr <= 0 {
1138 return none
1139 }
1140
1141 stack_ptr -= frame_size
1142 stack_offset := stack_ptr
1143
1144 // Restore captures
1145 for i in 0 .. cap_size {
1146 cap_ptr[i] = stack_data[stack_offset + i]
1147 }
1148 sp = stack_data[stack_offset + cap_size]
1149 // Restore PC from index (explicit cast for pointer arithmetic)
1150 // FIX: Use pointer indexing instead of addition
1151 inst_ptr = &prog_start[stack_data[stack_offset + cap_size + 1]]
1152 }
1153 }
1154 return none
1155}
1156
1157/******************************************************************************
1158*
1159* Public API
1160*
1161******************************************************************************/
1162
1163// find returns the first match of the regex in text.
1164pub fn (r &Regex) find(text string) ?Match {
1165 return r.find_from(text, 0)
1166}
1167
1168// find_from returns the first match starting from start_index.
1169// Optimized with fast prefix skipping and anchor checks.
1170pub fn (r &Regex) find_from(text string, start_index int) ?Match {
1171 if start_index < 0 || start_index > text.len {
1172 return none
1173 }
1174 mut m := r.new_machine()
1175
1176 // Optimization: Anchored pattern (^) only checks the start
1177 if r.anchored {
1178 if start_index == 0 {
1179 return r.vm_match(text, 0, mut m)
1180 }
1181 // If multiline mode is NOT enabled, ^ only matches index 0.
1182 // If multiline is enabled, we need to check every newline, handled by logic below.
1183 // Note: The compiler sets anchored=true only for ^ at start.
1184 // If multiline flag is set dynamically inside pattern, strict anchoring logic might differ,
1185 // but standard ^ usage benefits here.
1186 }
1187
1188 // Optimization: Boyer-Moore-like literal prefix skip
1189 if r.has_prefix {
1190 mut i := text.index_after(r.prefix_lit, start_index) or { -1 }
1191 for i != -1 {
1192 if res := r.vm_match(text, i, mut m) {
1193 return res
1194 }
1195 i = text.index_after(r.prefix_lit, i + 1) or { -1 }
1196 }
1197 return none
1198 }
1199
1200 for i := start_index; i <= text.len; i++ {
1201 // Skip UTF-8 continuation bytes to ensure we only match at rune boundaries
1202 // 0xC0 (11000000) is the start of a multi-byte sequence, 0x80 (10000000) is a continuation
1203 if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1204 continue
1205 }
1206 if res := r.vm_match(text, i, mut m) {
1207 return res
1208 }
1209 }
1210 return none
1211}
1212
1213// find_all returns all non-overlapping matches in text.
1214pub fn (r &Regex) find_all(text string) []Match {
1215 mut matches := []Match{}
1216 mut m := r.new_machine()
1217 mut i := 0
1218 for i <= text.len {
1219 if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1220 i++
1221 continue
1222 }
1223 if res := r.vm_match(text, i, mut m) {
1224 matches << res
1225 if res.end > i {
1226 i = res.end
1227 } else {
1228 // Empty match: advance by one full rune to avoid infinite loop
1229 _, rune_len := read_rune_at(text.str, text.len, i)
1230 i += if rune_len > 0 { rune_len } else { 1 }
1231 }
1232 } else {
1233 i++
1234 }
1235 }
1236 return matches
1237}
1238
1239// replace finds the first match and replaces it using repl. Supports $1, $2 backreferences.
1240pub fn (r &Regex) replace(text string, repl string) string {
1241 res := r.find(text) or { return text }
1242 mut sb := strings.new_builder(text.len + repl.len)
1243 sb.write_string(text[0..res.start])
1244 mut i := 0
1245 for i < repl.len {
1246 if repl[i] == `$` && i + 1 < repl.len {
1247 next_char := repl[i + 1]
1248 if next_char >= `0` && next_char <= `9` {
1249 d := next_char - `0` - 1
1250 if d >= 0 && d < res.groups.len {
1251 sb.write_string(res.groups[d])
1252 }
1253 i += 2
1254 continue
1255 }
1256 }
1257 sb.write_u8(repl[i])
1258 i++
1259 }
1260 sb.write_string(text[res.end..])
1261 return sb.str()
1262}
1263
1264// fullmatch returns a Match only if the pattern matches the entire text.
1265pub fn (r &Regex) fullmatch(text string) ?Match {
1266 mut m := r.new_machine()
1267 if res := r.vm_match(text, 0, mut m) {
1268 if res.end == text.len {
1269 return res
1270 }
1271 }
1272 return none
1273}
1274
1275// group_by_name retrieves text captured by a named group.
1276pub fn (r &Regex) group_by_name(m Match, name string) string {
1277 idx := r.group_map[name] or { return '' }
1278 return if idx < m.groups.len { m.groups[idx] } else { '' }
1279}
1280
1281/******************************************************************************
1282*
1283* Compatibility Layer
1284*
1285******************************************************************************/
1286
1287// new_regex is a helper wrapper to compile a regex pattern.
1288pub fn new_regex(pattern string, _ int) !Regex {
1289 return compile(pattern)
1290}
1291
1292// match_str is a compatibility alias for find_from.
1293pub fn (r &Regex) match_str(text string, start_index int, _ int) ?Match {
1294 return r.find_from(text, start_index)
1295}
1296
1297// get returns the matched text for a specific group index.
1298// Index 0 returns the full match, 1..n returns capture groups.
1299pub fn (m Match) get(idx int) ?string {
1300 if idx == 0 {
1301 return m.text
1302 }
1303 if idx > 0 && idx <= m.groups.len {
1304 return m.groups[idx - 1]
1305 }
1306 return none
1307}
1308
1309// get_all returns a list of all captured strings, starting with the full match at index 0.
1310pub fn (m Match) get_all() []string {
1311 mut res := [m.text]
1312 res << m.groups
1313 return res
1314}
1315