Gitly


1 /*
2 regex2 0.9.6 beta (VM Edition) - Performance Optimized
3 
4 Copyright (c) 2026 Dario Deledda. All rights reserved.
5 Use of this source code is governed by an MIT license
6 that can be found in the LICENSE file.
7 
8 This file contains a regex module based on a Virtual Machine approach.
9 
10 Features:
11  - **Non-recursive VM**: Safe execution without stack overflow on complex patterns.
12  - UTF8 support with Fast ASCII path.
13  - Literal characters, '.', '*', '{m,n}'.
14  - Short quantifiers: '?', '+'.
15  - Non-greedy quantifiers: '*?', '+?', '??'.
16  - Nested groups: '()'.
17  - Named groups: '(?P<name>...)'.
18  - Non-capturing groups: '(?:...)'.
19  - Character classes: \w, \W, \d, \D, \s, \S, \a, \A.
20  - Hex escapes: \xHH, \XHHHH.
21  - Alternation: '|'.
22  - Character classes: [abc], [^abc], [a-z].
23  - Anchors: '^' (start of string), '$' (end of string).
24  - Word boundaries: '\b' (word boundary), '\B' (non-word boundary).
25  - Inline Flags:
26    - '(?i)' Case insensitive.
27    - '(?m)' Multiline (anchors match newlines).
28    - '(?s)' Dot-all (dot matches newline).
29 
30 Configuration:
31  - `max_stack_depth`: Controls the dynamic growth limit of the backtracking stack.
32     Default is 2048. Increase this value if you encounter complex patterns failing
33     on deep recursions/backtracking, or decrease it to limit memory usage.
34 
35 Functions:
36  - `compile(pattern) !Regex` -> Compiles a pattern into a Regex object.
37  - `(r Regex) find(text) ?Match` -> Finds the first match in a string.
38  - `(r Regex) find_all(text) []Match` -> Finds all non-overlapping matches.
39  - `(r Regex) find_from(text, start_index) ?Match` -> Finds the first match starting from a specific index.
40  - `(r Regex) replace(text, repl) string` -> Finds and replaces the first match.
41  - `(r Regex) fullmatch(text string) ?Match` -> Match exact strings.
42  - `(r Regex) group_by_name(m Match, name string) string` -> Get text of a named group.
43 
44 Key Architectural Features and Optimizations:
45  - **Pre-allocated Machine workspace**: All search operations are zero-allocation.
46  - **Raw pointer access**: Bypasses bounds checking for string and instruction access.
47  - **Fast ASCII Path**: Characters < 128 bypass heavy UTF-8 decoding.
48  - **Instruction merging**: Consecutive character matches are merged into string blocks.
49  - **Bitmap lookups**: ASCII character classes use a 128-bit bitset for O(1) matching.
50  - **NFA Virtual Machine**: Executes bytecode instructions to simulate pattern matching.
51  - **Dynamic Stack Growth**: Automatically expands the backtracking stack to prevent false negatives.
52  - **Zero-Allocation Search**: Reuses a pre-allocated Machine workspace for search operations.
53  - **Anchored Optimization**: Patterns starting with '^' skip the scanning loop.
54  - **Prefix Skipping**: Uses Boyer-Moore-like skipping for literal prefixes.
55 
56 */
57 
58 module pcre
59 
60 import strconv
61 import strings
62 
63 /******************************************************************************
64 *
65 * VM Instruction Definition
66 *
67 ******************************************************************************/
68 
69 // InstType defines the operation codes for the Virtual Machine.
70 enum InstType as u8 {
71     match             // Halt and signal a successful match.
72     char              // Match a single UTF-8 rune.
73     string            // Match a sequence of ASCII characters (merged optimization).
74     any               // Match any character (dot).
75     class             // Match a character class (e.g., [a-z] or \d).
76     split             // Branch execution: target_x (primary), target_y (backtrack).
77     jmp               // Unconditional jump to target_x.
78     save              // Save current string position into a capture group index.
79     assert_start      // Assert current position is start of string.
80     assert_end        // Assert current position is end of string.
81     assert_line_start // Assert current position is start of a line (multiline mode).
82     assert_line_end   // Assert current position is end of a line (multiline mode).
83     assert_bound      // Assert word boundary (\b).
84     assert_nbound     // Assert non-word boundary (\B).
85 }
86 
87 // Inst represents a single bytecode instruction.
88 // Packed for memory locality.
89 struct Inst {
90 mut:
91     typ         InstType
92     val         rune   // Used by .char
93     val_str     string // Used by .string
94     val_len     int    // Byte length of val_str
95     target_x    int    // Primary jump/split target
96     target_y    int    // Backtrack target for .split
97     group_idx   int    // Capture group index for .save
98     char_class  []rune // Literal runes for character classes
99     bitmap      [4]u32 // 128-bit bitset for ASCII character classes
100     inverted    bool   // Negation flag for classes
101     ignore_case bool   // Case-insensitive flag
102     dot_all     bool   // Whether '.' matches '\n'
103 }
104 
105 // Machine provides a workspace for VM execution.
106 // To ensure thread safety, this is created per top-level API call.
107 pub struct Machine {
108 mut:
109     stack    []int // Backtracking stack stores: [capture_states..., string_ptr, next_pc]
110     captures []int // Flat array of [start, end] byte indices for groups
111 }
112 
113 // Regex is the compiled regular expression object.
114 pub struct Regex {
115 pub:
116     pattern      string         // The original regex string
117     prog         []Inst         // Compiled bytecode
118     total_groups int            // Number of capture groups defined in pattern
119     group_map    map[string]int // Mapping of names to indices for (?P<name>...)
120     prefix_lit   string         // Pre-calculated literal prefix for fast-skip optimization
121     has_prefix   bool           // Whether a literal prefix exists
122     anchored     bool           // True if pattern starts with '^' (optimization hint)
123 pub mut:
124     max_stack_depth int // User-defined stack limit hint
125 }
126 
127 // Match contains the results of a successful regex match.
128 pub struct Match {
129 pub:
130     text   string   // The full text of the match
131     start  int      // Starting byte index in the source string
132     end    int      // Ending byte index in the source string
133     groups []string // Sub-strings captured by groups
134 }
135 
136 // Internal structures for Compilation
137 struct Quantifier {
138 mut:
139     min    int
140     max    int // -1 for infinity
141     greedy bool
142 }
143 
144 struct Flags {
145 mut:
146     ignore_case bool
147     multiline   bool
148     dot_all     bool
149 }
150 
151 enum NodeType {
152     chr
153     any_char
154     group
155     alternation
156     char_class
157     start_of_string
158     end_of_string
159     word_boundary
160     non_word_boundary
161     word_char
162     non_word_char
163     digit
164     non_digit
165     whitespace
166     non_whitespace
167     lowercase_char
168     uppercase_char
169 }
170 
171 struct Node {
172 mut:
173     typ                 NodeType
174     chr                 rune
175     quant               Quantifier
176     nodes               []Node
177     alternatives        [][]Node
178     group_capture_index int = -1
179     char_set            []rune
180     inverted            bool
181     ignore_case         bool
182     multiline           bool
183     dot_all             bool
184 }
185 
186 /******************************************************************************
187 *
188 * Internal Utilities (Inlined for Speed)
189 *
190 ******************************************************************************/
191 
192 // read_rune_at decodes a UTF-8 rune from a byte pointer safely.
193 // Marked inline to be embedded directly into the VM loop.
194 @[inline]
195 fn read_rune_at(str &u8, len int, index int) (rune, int) {
196     unsafe {
197         if index >= len {
198             return 0, 0
199         }
200         b0 := u32(str[index])
201         if b0 < 0x80 {
202             return rune(b0), 1
203         }
204         if (b0 & 0xE0) == 0xC0 && index + 1 < len {
205             return rune(((b0 & 0x1F) << 6) | (u32(str[index + 1]) & 0x3F)), 2
206         }
207         if (b0 & 0xF0) == 0xE0 && index + 2 < len {
208             return rune(((b0 & 0x0F) << 12) | ((u32(str[index + 1]) & 0x3F) << 6) | (u32(str[
209                 index + 2]) & 0x3F)), 3
210         }
211         if (b0 & 0xF8) == 0xF0 && index + 3 < len {
212             return rune(((b0 & 0x07) << 18) | ((u32(str[index + 1]) & 0x3F) << 12) | ((u32(str[
213                 index + 2]) & 0x3F) << 6) | (u32(str[index + 3]) & 0x3F)), 4
214         }
215     }
216     return 0, 0
217 }
218 
219 // is_word_char returns true for alphanumeric ASCII characters and underscore.
220 @[inline]
221 fn is_word_char(r u8) bool {
222     return (r >= `a` && r <= `z`) || (r >= `A` && r <= `Z`) || (r >= `0` && r <= `9`) || r == `_`
223 }
224 
225 // set_bitmap sets a specific bit in a 128-bit bitset for ASCII matching.
226 @[inline]
227 fn set_bitmap(mut bitmap [4]u32, r rune) {
228     if r >= 0 && r < 128 {
229         idx := u32(r) >> 5
230         bit := u32(r) & 31
231         bitmap[idx] |= (u32(1) << bit)
232     }
233 }
234 
235 /******************************************************************************
236 *
237 * Compiler Logic
238 *
239 ******************************************************************************/
240 
241 // compile transforms a regex pattern string into a Regex object.
242 pub fn compile(pattern string) !Regex {
243     mut group_map := map[string]int{}
244     initial_flags := Flags{false, false, false}
245 
246     // Phase 1: AST Parsing
247     nodes, _, final_group_count := parse_nodes(pattern, 0, `\0`, 0, initial_flags, mut group_map)!
248 
249     root := Node{
250         typ:   .group
251         nodes: nodes
252         quant: Quantifier{1, 1, true}
253     }
254 
255     // Phase 2: Bytecode Emission
256     mut compiler := Compiler{
257         prog: []Inst{cap: 128}
258     }
259     compiler.emit_node(root)
260     compiler.emit(Inst{ typ: .match })
261 
262     // Phase 3: Optimization
263     optimized_prog := compiler.optimize()
264 
265     // Detect Prefix and Anchor optimizations
266     mut prefix := ''
267     mut has_prefix := false
268     mut anchored := false
269 
270     if optimized_prog.len > 0 {
271         first := optimized_prog[0]
272         if first.typ == .string {
273             prefix = first.val_str
274             has_prefix = true
275         } else if first.typ == .char && !first.ignore_case && first.val < 128 {
276             prefix = unsafe { u8(first.val).ascii_str() }
277             has_prefix = true
278         } else if first.typ == .assert_start {
279             anchored = true
280         }
281     }
282 
283     return Regex{
284         max_stack_depth: 2048
285         pattern:         pattern
286         prog:            optimized_prog
287         total_groups:    final_group_count
288         group_map:       group_map
289         prefix_lit:      prefix
290         has_prefix:      has_prefix
291         anchored:        anchored
292     }
293 }
294 
295 // new_machine allocates a new VM state machine.
296 // This isolates the runtime memory (stack/captures) from the compiled regex, allowing thread-safe usage.
297 pub fn (r &Regex) new_machine() Machine {
298     // Pre-allocate enough space for stack and captures to avoid re-allocation in hot path
299     return Machine{
300         stack:    []int{len: r.max_stack_depth}
301         captures: []int{len: r.total_groups * 2}
302     }
303 }
304 
305 // Compiler holds the state for generating the bytecode instructions.
306 struct Compiler {
307 mut:
308     prog []Inst
309 }
310 
311 // emit appends an instruction to the program and returns its index.
312 fn (mut c Compiler) emit(i Inst) int {
313     c.prog << i
314     return c.prog.len - 1
315 }
316 
317 // optimize merges consecutive literal characters into single string instructions
318 // and resolves jump targets to absolute indices.
319 fn (mut c Compiler) optimize() []Inst {
320     mut targets := map[int]bool{}
321     for inst in c.prog {
322         if inst.typ == .split || inst.typ == .jmp {
323             targets[inst.target_x] = true
324             targets[inst.target_y] = true
325         }
326     }
327 
328     mut new_prog := []Inst{cap: c.prog.len}
329     mut idx_map := map[int]int{}
330 
331     mut i := 0
332     for i < c.prog.len {
333         inst := c.prog[i]
334         idx_map[i] = new_prog.len
335 
336         // Optimization: Merge consecutive chars
337         if inst.typ == .char && !inst.ignore_case && inst.val < 128 {
338             mut s_val := unsafe { u8(inst.val).ascii_str() }
339             mut j := i + 1
340             for j < c.prog.len && !targets[j] {
341                 next := c.prog[j]
342                 if next.typ == .char && !next.ignore_case && next.val < 128 {
343                     s_val += unsafe { u8(next.val).ascii_str() }
344                     j++
345                 } else {
346                     break
347                 }
348             }
349             if j > i + 1 {
350                 new_prog << Inst{
351                     typ:     .string
352                     val_str: s_val
353                     val_len: s_val.len
354                 }
355                 for k := i + 1; k < j; k++ {
356                     idx_map[k] = new_prog.len
357                 }
358                 i = j
359                 continue
360             }
361         }
362         new_prog << inst
363         i++
364     }
365     idx_map[c.prog.len] = new_prog.len
366 
367     // Fix jump offsets
368     for mut inst in new_prog {
369         if inst.typ == .split || inst.typ == .jmp {
370             inst.target_x = idx_map[inst.target_x] or { inst.target_x }
371             inst.target_y = idx_map[inst.target_y] or { inst.target_y }
372         }
373     }
374     return new_prog
375 }
376 
377 // emit_class generates the instructions for a character class node.
378 // It populates the bitmap for O(1) ASCII matching and the slice for Unicode.
379 fn (mut c Compiler) emit_class(node Node) {
380     mut bitmap := [4]u32{}
381     mut char_class := node.char_set.clone()
382     mut inverted := node.inverted
383 
384     // Pre-compile common classes into the bitmap for O(1) lookups
385     match node.typ {
386         .word_char {
387             for r := `0`; r <= `9`; r++ {
388                 set_bitmap(mut bitmap, r)
389             }
390             for r := `a`; r <= `z`; r++ {
391                 set_bitmap(mut bitmap, r)
392             }
393             for r := `A`; r <= `Z`; r++ {
394                 set_bitmap(mut bitmap, r)
395             }
396             set_bitmap(mut bitmap, `_`)
397         }
398         .non_word_char {
399             inverted = true
400             for r := `0`; r <= `9`; r++ {
401                 set_bitmap(mut bitmap, r)
402             }
403             for r := `a`; r <= `z`; r++ {
404                 set_bitmap(mut bitmap, r)
405             }
406             for r := `A`; r <= `Z`; r++ {
407                 set_bitmap(mut bitmap, r)
408             }
409             set_bitmap(mut bitmap, `_`)
410         }
411         .digit {
412             for r := `0`; r <= `9`; r++ {
413                 set_bitmap(mut bitmap, r)
414             }
415         }
416         .non_digit {
417             inverted = true
418             for r := `0`; r <= `9`; r++ {
419                 set_bitmap(mut bitmap, r)
420             }
421         }
422         .whitespace {
423             for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
424                 set_bitmap(mut bitmap, r)
425             }
426         }
427         .non_whitespace {
428             inverted = true
429             for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
430                 set_bitmap(mut bitmap, r)
431             }
432         }
433         .lowercase_char {
434             for r := `a`; r <= `z`; r++ {
435                 set_bitmap(mut bitmap, r)
436             }
437         }
438         .uppercase_char {
439             for r := `A`; r <= `Z`; r++ {
440                 set_bitmap(mut bitmap, r)
441             }
442         }
443         .char_class {
444             for r in node.char_set {
445                 set_bitmap(mut bitmap, r)
446                 if node.ignore_case {
447                     if r >= `a` && r <= `z` {
448                         set_bitmap(mut bitmap, r - 32)
449                     } else if r >= `A` && r <= `Z` {
450                         set_bitmap(mut bitmap, r + 32)
451                     }
452                 }
453             }
454         }
455         else {}
456     }
457 
458     c.emit(Inst{
459         typ:         .class
460         char_class:  char_class
461         bitmap:      bitmap
462         inverted:    inverted
463         ignore_case: node.ignore_case
464     })
465 }
466 
467 // emit_node handles quantifiers and loops, delegating the actual logic to emit_logic.
468 fn (mut c Compiler) emit_node(node Node) {
469     for _ in 0 .. node.quant.min {
470         c.emit_logic(node)
471     }
472 
473     if node.quant.max == -1 {
474         split_idx := c.emit(Inst{ typ: .split })
475         start_pc := c.prog.len
476         c.emit_logic(node)
477         c.emit(Inst{ typ: .jmp, target_x: split_idx })
478         if node.quant.greedy {
479             c.prog[split_idx].target_x = start_pc
480             c.prog[split_idx].target_y = c.prog.len
481         } else {
482             c.prog[split_idx].target_x = c.prog.len
483             c.prog[split_idx].target_y = start_pc
484         }
485     } else if node.quant.max > node.quant.min {
486         rem := node.quant.max - node.quant.min
487         mut splits := []int{}
488         for _ in 0 .. rem {
489             idx := c.emit(Inst{ typ: .split })
490             match_pc := c.prog.len
491             if node.quant.greedy {
492                 c.prog[idx].target_x = match_pc
493             } else {
494                 c.prog[idx].target_y = match_pc
495             }
496             c.emit_logic(node)
497             splits << idx
498         }
499         end_pc := c.prog.len
500         for idx in splits {
501             if node.quant.greedy {
502                 c.prog[idx].target_y = end_pc
503             } else {
504                 c.prog[idx].target_x = end_pc
505             }
506         }
507     }
508 }
509 
510 // emit_logic generates instructions for specific node types (char, group, alternation).
511 fn (mut c Compiler) emit_logic(node Node) {
512     match node.typ {
513         .chr {
514             c.emit(Inst{ typ: .char, val: node.chr, ignore_case: node.ignore_case })
515         }
516         .any_char {
517             c.emit(Inst{ typ: .any, dot_all: node.dot_all })
518         }
519         .group {
520             if node.group_capture_index != -1 {
521                 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 })
522             }
523             for child in node.nodes {
524                 c.emit_node(child)
525             }
526             if node.group_capture_index != -1 {
527                 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 + 1 })
528             }
529         }
530         .alternation {
531             if node.alternatives.len == 0 {
532                 return
533             }
534             mut end_jumps := []int{}
535             for i := 0; i < node.alternatives.len - 1; i++ {
536                 split_idx := c.emit(Inst{ typ: .split })
537                 c.prog[split_idx].target_x = c.prog.len
538                 for child in node.alternatives[i] {
539                     c.emit_node(child)
540                 }
541                 end_jumps << c.emit(Inst{ typ: .jmp })
542                 c.prog[split_idx].target_y = c.prog.len
543             }
544             for child in node.alternatives.last() {
545                 c.emit_node(child)
546             }
547             for idx in end_jumps {
548                 c.prog[idx].target_x = c.prog.len
549             }
550         }
551         .start_of_string {
552             c.emit(Inst{
553                 typ: if node.multiline { InstType.assert_line_start } else { InstType.assert_start }
554             })
555         }
556         .end_of_string {
557             c.emit(Inst{
558                 typ: if node.multiline { InstType.assert_line_end } else { InstType.assert_end }
559             })
560         }
561         .word_boundary {
562             c.emit(Inst{ typ: .assert_bound })
563         }
564         .non_word_boundary {
565             c.emit(Inst{ typ: .assert_nbound })
566         }
567         else {
568             c.emit_class(node)
569         }
570     }
571 }
572 
573 // parse_nodes implements a recursive descent parser to construct the AST from the pattern string.
574 fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_start int, passed_flags Flags, mut group_map map[string]int) !([]Node, int, int) {
575     mut pos := pos_start
576     mut group_counter := group_counter_start
577     mut current_flags := passed_flags
578     mut alternatives := [][]Node{}
579     mut current_sequence := []Node{}
580 
581     for pos < pattern.len {
582         chr, char_len := read_rune_at(pattern.str, pattern.len, pos)
583         if chr == terminator {
584             pos += char_len
585             if alternatives.len > 0 {
586                 alternatives << current_sequence
587                 return [
588                     Node{
589                         typ:          .alternation
590                         alternatives: alternatives
591                         quant:        Quantifier{1, 1, true}
592                     },
593                 ], pos, group_counter
594             }
595             return current_sequence, pos, group_counter
596         }
597         if chr == `|` {
598             pos += char_len
599             if current_sequence.len == 0 {
600                 return error('Empty alternative')
601             }
602             alternatives << current_sequence
603             current_sequence = []Node{}
604             continue
605         }
606 
607         mut parsed_nodes := []Node{}
608         match chr {
609             `^` {
610                 pos += char_len
611                 parsed_nodes << Node{
612                     typ:       .start_of_string
613                     multiline: current_flags.multiline
614                 }
615             }
616             `$` {
617                 pos += char_len
618                 parsed_nodes << Node{
619                     typ:       .end_of_string
620                     multiline: current_flags.multiline
621                 }
622             }
623             `.` {
624                 pos += char_len
625                 parsed_nodes << Node{
626                     typ:     .any_char
627                     dot_all: current_flags.dot_all
628                 }
629             }
630             `*`, `+`, `?`, `{` {
631                 return error('Quantifier must follow a token')
632             }
633             `(` {
634                 pos += char_len
635                 mut idx := -1
636                 mut cap := true
637                 if pos < pattern.len && pattern[pos] == `?` {
638                     pos++
639                     if pos < pattern.len && pattern[pos] in [`i`, `m`, `s`] {
640                         for pos < pattern.len && pattern[pos] != `)` {
641                             match pattern[pos] {
642                                 `i` { current_flags.ignore_case = true }
643                                 `m` { current_flags.multiline = true }
644                                 `s` { current_flags.dot_all = true }
645                                 else {}
646                             }
647 
648                             pos++
649                         }
650                         if pos < pattern.len {
651                             pos++
652                         }
653                         continue
654                     } else if pos < pattern.len && pattern[pos] == `:` {
655                         cap = false
656                         pos++
657                     } else if pos < pattern.len && pattern[pos] == `P` {
658                         pos++
659                         if pos >= pattern.len || pattern[pos] != `<` {
660                             return error('Invalid named group syntax')
661                         }
662                         pos++
663                         end := pattern.index_after('>', pos) or { -1 }
664                         if end == -1 {
665                             return error('Unclosed named group')
666                         }
667                         name := pattern[pos..end]
668                         if name in group_map {
669                             return error('Duplicate named group: ${name}')
670                         }
671                         idx = group_counter
672                         group_map[name] = idx
673                         pos = end + 1
674                     }
675                 }
676                 if cap {
677                     if idx == -1 {
678                         idx = group_counter
679                     }
680                     group_counter++
681                 }
682                 sub, new_p, new_c := parse_nodes(pattern, pos, `)`, group_counter, current_flags, mut
683                     group_map)!
684                 pos = new_p
685                 group_counter = new_c
686                 parsed_nodes << Node{
687                     typ:                 .group
688                     nodes:               sub
689                     group_capture_index: idx
690                 }
691             }
692             `[` {
693                 pos += char_len
694                 end := pattern.index_after(']', pos) or { -1 }
695                 if end == -1 {
696                     return error('Unclosed character class')
697                 }
698                 content := pattern[pos..end]
699                 pos = end + 1
700                 inv := content.len > 0 && content[0] == `^`
701                 mut set := []rune{}
702                 mut i := if inv { 1 } else { 0 }
703                 for i < content.len {
704                     c, l := read_rune_at(content.str, content.len, i)
705                     if i + l + 1 < content.len && content[i + l] == `-` {
706                         ec, el := read_rune_at(content.str, content.len, i + l + 1)
707                         for r := c; r <= ec; r++ {
708                             set << r
709                         }
710                         i += l + 1 + el
711                     } else {
712                         set << c
713                         i += l
714                     }
715                 }
716                 parsed_nodes << Node{
717                     typ:         .char_class
718                     char_set:    set
719                     inverted:    inv
720                     ignore_case: current_flags.ignore_case
721                 }
722             }
723             `\\` {
724                 pos += char_len
725                 if pos >= pattern.len {
726                     return error('Trailing backslash')
727                 }
728                 esc, el := read_rune_at(pattern.str, pattern.len, pos)
729                 pos += el
730                 match esc {
731                     `w` {
732                         parsed_nodes << Node{
733                             typ: .word_char
734                         }
735                     }
736                     `W` {
737                         parsed_nodes << Node{
738                             typ: .non_word_char
739                         }
740                     }
741                     `d` {
742                         parsed_nodes << Node{
743                             typ: .digit
744                         }
745                     }
746                     `D` {
747                         parsed_nodes << Node{
748                             typ: .non_digit
749                         }
750                     }
751                     `s` {
752                         parsed_nodes << Node{
753                             typ: .whitespace
754                         }
755                     }
756                     `S` {
757                         parsed_nodes << Node{
758                             typ: .non_whitespace
759                         }
760                     }
761                     `b` {
762                         parsed_nodes << Node{
763                             typ: .word_boundary
764                         }
765                     }
766                     `B` {
767                         parsed_nodes << Node{
768                             typ: .non_word_boundary
769                         }
770                     }
771                     `a` {
772                         parsed_nodes << Node{
773                             typ: .lowercase_char
774                         }
775                     }
776                     `A` {
777                         parsed_nodes << Node{
778                             typ: .uppercase_char
779                         }
780                     }
781                     `x` {
782                         // \xHH - two hex digits decode to a character
783                         if pos + 2 > pattern.len {
784                             return error('\\x requires exactly 2 hex digits')
785                         }
786                         hex_str := pattern[pos..pos + 2]
787                         val := strconv.parse_uint(hex_str, 16, 32) or {
788                             return error('Invalid hex escape \\x${hex_str}')
789                         }
790                         pos += 2
791                         parsed_nodes << Node{
792                             typ:         .chr
793                             chr:         rune(val)
794                             ignore_case: current_flags.ignore_case
795                         }
796                     }
797                     `X` {
798                         // \XHHHH - four hex digits decode to a Unicode codepoint
799                         if pos + 4 > pattern.len {
800                             return error('\\X requires exactly 4 hex digits')
801                         }
802                         hex_str := pattern[pos..pos + 4]
803                         val := strconv.parse_uint(hex_str, 16, 32) or {
804                             return error('Invalid hex escape \\X${hex_str}')
805                         }
806                         pos += 4
807                         parsed_nodes << Node{
808                             typ:         .chr
809                             chr:         rune(val)
810                             ignore_case: current_flags.ignore_case
811                         }
812                     }
813                     else {
814                         parsed_nodes << Node{
815                             typ:         .chr
816                             chr:         esc
817                             ignore_case: current_flags.ignore_case
818                         }
819                     }
820                 }
821             }
822             else {
823                 pos += char_len
824                 parsed_nodes << Node{
825                     typ:         .chr
826                     chr:         chr
827                     ignore_case: current_flags.ignore_case
828                 }
829             }
830         }
831 
832         if parsed_nodes.len > 0 {
833             mut q := Quantifier{1, 1, true}
834             if pos < pattern.len {
835                 peek := pattern[pos]
836                 match peek {
837                     `*` {
838                         q = Quantifier{0, -1, true}
839                         pos++
840                     }
841                     `+` {
842                         q = Quantifier{1, -1, true}
843                         pos++
844                     }
845                     `?` {
846                         q = Quantifier{0, 1, true}
847                         pos++
848                     }
849                     `{` {
850                         end := pattern.index_after('}', pos) or { -1 }
851                         if end == -1 {
852                             return error('Unclosed quantifier')
853                         }
854                         parts := pattern[pos + 1..end].split(',')
855                         min := parts[0].int()
856                         max := if parts.len > 1 {
857                             if parts[1] == '' { -1 } else { parts[1].int() }
858                         } else {
859                             min
860                         }
861                         if min < 0 || (max != -1 && max < min) {
862                             return error('Invalid quantifier range {${min},${max}}')
863                         }
864                         q = Quantifier{min, max, true}
865                         pos = end + 1
866                     }
867                     else {}
868                 }
869 
870                 if pos < pattern.len && pattern[pos] == `?` {
871                     q.greedy = false
872                     pos++
873                 }
874             }
875             parsed_nodes.last().quant = q
876         }
877         current_sequence << parsed_nodes
878     }
879 
880     if alternatives.len > 0 {
881         if current_sequence.len == 0 {
882             return error('Empty alternative at end of pattern')
883         }
884         alternatives << current_sequence
885         return [
886             Node{
887                 typ:          .alternation
888                 alternatives: alternatives
889                 quant:        Quantifier{1, 1, true}
890             },
891         ], pos, group_counter
892     }
893     return current_sequence, pos, group_counter
894 }
895 
896 /******************************************************************************
897 *
898 * Virtual Machine Execution Engine (Highly Optimized)
899 *
900 ******************************************************************************/
901 
902 // vm_match executes the bytecode against the input string using the provided Machine state.
903 // OPTIMIZATION: Uses raw pointers for instruction and stack access to bypass bounds checking.
904 @[direct_array_access]
905 fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match {
906     unsafe {
907         // Optimization: Cast voidptr to typed pointer for direct indexing
908         mut cap_ptr := &int(m.captures.data)
909         cap_len := m.captures.len
910 
911         // Fast clear of captures using pointer arithmetic (memset-like)
912         for i := 0; i < cap_len; i++ {
913             cap_ptr[i] = -1
914         }
915 
916         mut sp := start_pos // String Pointer Index
917         mut stack_ptr := 0 // Stack Pointer Offset
918 
919         cap_size := r.total_groups * 2
920         frame_size := cap_size + 2 // [captures..., saved_sp, saved_pc]
921 
922         // Raw pointers for hot path access
923         prog_start := &Inst(r.prog.data)
924         mut inst_ptr := prog_start // PC as a pointer
925 
926         str_ptr := text.str
927         str_len := text.len
928 
929         // Cache stack data pointer (cast to typed pointer)
930         mut stack_data := &int(m.stack.data)
931         mut stack_max := m.stack.len
932 
933         for {
934             // Check if we walked off the program (should be caught by match inst)
935             // Using pointer arithmetic: offset = (inst_ptr - prog_start)
936 
937             match inst_ptr.typ {
938                 .match {
939                     // Only allocate result strings on successful match
940                     mut s_groups := []string{cap: r.total_groups}
941                     for i := 0; i < r.total_groups; i++ {
942                         s, e := cap_ptr[i * 2], cap_ptr[i * 2 + 1]
943                         s_groups << if s != -1 && e >= s { text[s..e] } else { '' }
944                     }
945                     return Match{
946                         text:   text[start_pos..sp]
947                         start:  start_pos
948                         end:    sp
949                         groups: s_groups
950                     }
951                 }
952                 .char {
953                     if sp >= str_len {
954                         goto backtrack
955                     }
956                     curr_byte := str_ptr[sp]
957 
958                     // Fast ASCII path
959                     if curr_byte < 128 && inst_ptr.val < 128 {
960                         mut c1, mut c2 := curr_byte, u8(inst_ptr.val)
961                         if inst_ptr.ignore_case {
962                             if c1 >= `a` && c1 <= `z` {
963                                 c1 -= 32
964                             }
965                             if c2 >= `a` && c2 <= `z` {
966                                 c2 -= 32
967                             }
968                         }
969                         if c1 == c2 {
970                             sp++
971                             inst_ptr++
972                             continue
973                         }
974                         goto backtrack
975                     }
976 
977                     // UTF-8 Path
978                     rn, l := read_rune_at(str_ptr, str_len, sp)
979                     if l == 0 {
980                         goto backtrack
981                     }
982 
983                     mut match_ok := false
984                     if inst_ptr.ignore_case {
985                         r1 := if rn >= `a` && rn <= `z` { rn - 32 } else { rn }
986                         r2 := if inst_ptr.val >= `a` && inst_ptr.val <= `z` {
987                             inst_ptr.val - 32
988                         } else {
989                             inst_ptr.val
990                         }
991                         if r1 == r2 {
992                             match_ok = true
993                         }
994                     } else if rn == inst_ptr.val {
995                         match_ok = true
996                     }
997 
998                     if match_ok {
999                         sp += l
1000                         inst_ptr++
1001                     } else {
1002                         goto backtrack
1003                     }
1004                 }
1005                 .string {
1006                     if sp + inst_ptr.val_len > str_len {
1007                         goto backtrack
1008                     }
1009                     // Inline memcmp
1010                     v_str := inst_ptr.val_str.str
1011                     for i in 0 .. inst_ptr.val_len {
1012                         if str_ptr[sp + i] != v_str[i] {
1013                             goto backtrack
1014                         }
1015                     }
1016                     sp += inst_ptr.val_len
1017                     inst_ptr++
1018                 }
1019                 .class {
1020                     if sp >= str_len {
1021                         goto backtrack
1022                     }
1023                     c_byte := str_ptr[sp]
1024                     mut matched := false
1025                     mut cl := 1
1026 
1027                     // Optimization: Bitmap lookup for ASCII
1028                     if c_byte < 128 {
1029                         if (inst_ptr.bitmap[c_byte >> 5] & (u32(1) << (c_byte & 31))) != 0 {
1030                             matched = true
1031                         }
1032                     } else {
1033                         rn, l := read_rune_at(str_ptr, str_len, sp)
1034                         cl = l
1035                         for cr in inst_ptr.char_class {
1036                             if cr == rn {
1037                                 matched = true
1038                                 break
1039                             }
1040                         }
1041                     }
1042 
1043                     if matched != inst_ptr.inverted {
1044                         sp += cl
1045                         inst_ptr++
1046                     } else {
1047                         goto backtrack
1048                     }
1049                 }
1050                 .any {
1051                     if sp >= str_len {
1052                         goto backtrack
1053                     }
1054                     if inst_ptr.dot_all || str_ptr[sp] != `\n` {
1055                         _, cl := read_rune_at(str_ptr, str_len, sp)
1056                         sp += cl
1057                         inst_ptr++
1058                     } else {
1059                         goto backtrack
1060                     }
1061                 }
1062                 .save {
1063                     cap_ptr[inst_ptr.group_idx] = sp
1064                     inst_ptr++
1065                 }
1066                 .split {
1067                     if stack_ptr + frame_size >= stack_max {
1068                         new_size := stack_max * 2
1069                         if new_size > r.max_stack_depth {
1070                             goto backtrack
1071                         }
1072                         m.stack.grow_len(new_size)
1073                         stack_data = &int(m.stack.data) // Pointer might change on realloc
1074                         stack_max = new_size
1075                     }
1076 
1077                     // Optimization: Unrolled stack push
1078                     stack_offset := stack_ptr
1079                     for i in 0 .. cap_size {
1080                         stack_data[stack_offset + i] = cap_ptr[i]
1081                     }
1082                     stack_data[stack_offset + cap_size] = sp
1083                     // Save backtrack target PC index
1084                     stack_data[stack_offset + cap_size + 1] = inst_ptr.target_y
1085 
1086                     stack_ptr += frame_size
1087                     // Jump to primary target (convert index to pointer)
1088                     // FIX: Use pointer indexing instead of addition to avoid type mismatch
1089                     inst_ptr = &prog_start[inst_ptr.target_x]
1090                 }
1091                 .jmp {
1092                     inst_ptr = &prog_start[inst_ptr.target_x]
1093                 }
1094                 .assert_start {
1095                     if sp == 0 {
1096                         inst_ptr++
1097                     } else {
1098                         goto backtrack
1099                     }
1100                 }
1101                 .assert_end {
1102                     if sp == str_len {
1103                         inst_ptr++
1104                     } else {
1105                         goto backtrack
1106                     }
1107                 }
1108                 .assert_line_start {
1109                     if sp == 0 || (sp > 0 && str_ptr[sp - 1] == `\n`) {
1110                         inst_ptr++
1111                     } else {
1112                         goto backtrack
1113                     }
1114                 }
1115                 .assert_line_end {
1116                     if sp == str_len || str_ptr[sp] == `\n` {
1117                         inst_ptr++
1118                     } else {
1119                         goto backtrack
1120                     }
1121                 }
1122                 .assert_bound, .assert_nbound {
1123                     l := if sp > 0 { is_word_char(str_ptr[sp - 1]) } else { false }
1124                     r_ := if sp < str_len { is_word_char(str_ptr[sp]) } else { false }
1125                     if (inst_ptr.typ == .assert_bound && l != r_)
1126                         || (inst_ptr.typ == .assert_nbound && l == r_) {
1127                         inst_ptr++
1128                     } else {
1129                         goto backtrack
1130                     }
1131                 }
1132             }
1133 
1134             continue
1135 
1136             backtrack:
1137             if stack_ptr <= 0 {
1138                 return none
1139             }
1140 
1141             stack_ptr -= frame_size
1142             stack_offset := stack_ptr
1143 
1144             // Restore captures
1145             for i in 0 .. cap_size {
1146                 cap_ptr[i] = stack_data[stack_offset + i]
1147             }
1148             sp = stack_data[stack_offset + cap_size]
1149             // Restore PC from index (explicit cast for pointer arithmetic)
1150             // FIX: Use pointer indexing instead of addition
1151             inst_ptr = &prog_start[stack_data[stack_offset + cap_size + 1]]
1152         }
1153     }
1154     return none
1155 }
1156 
1157 /******************************************************************************
1158 *
1159 * Public API
1160 *
1161 ******************************************************************************/
1162 
1163 // find returns the first match of the regex in text.
1164 pub fn (r &Regex) find(text string) ?Match {
1165     return r.find_from(text, 0)
1166 }
1167 
1168 // find_from returns the first match starting from start_index.
1169 // Optimized with fast prefix skipping and anchor checks.
1170 pub fn (r &Regex) find_from(text string, start_index int) ?Match {
1171     if start_index < 0 || start_index > text.len {
1172         return none
1173     }
1174     mut m := r.new_machine()
1175 
1176     // Optimization: Anchored pattern (^) only checks the start
1177     if r.anchored {
1178         if start_index == 0 {
1179             return r.vm_match(text, 0, mut m)
1180         }
1181         // If multiline mode is NOT enabled, ^ only matches index 0.
1182         // If multiline is enabled, we need to check every newline, handled by logic below.
1183         // Note: The compiler sets anchored=true only for ^ at start.
1184         // If multiline flag is set dynamically inside pattern, strict anchoring logic might differ,
1185         // but standard ^ usage benefits here.
1186     }
1187 
1188     // Optimization: Boyer-Moore-like literal prefix skip
1189     if r.has_prefix {
1190         mut i := text.index_after(r.prefix_lit, start_index) or { -1 }
1191         for i != -1 {
1192             if res := r.vm_match(text, i, mut m) {
1193                 return res
1194             }
1195             i = text.index_after(r.prefix_lit, i + 1) or { -1 }
1196         }
1197         return none
1198     }
1199 
1200     for i := start_index; i <= text.len; i++ {
1201         // Skip UTF-8 continuation bytes to ensure we only match at rune boundaries
1202         // 0xC0 (11000000) is the start of a multi-byte sequence, 0x80 (10000000) is a continuation
1203         if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1204             continue
1205         }
1206         if res := r.vm_match(text, i, mut m) {
1207             return res
1208         }
1209     }
1210     return none
1211 }
1212 
1213 // find_all returns all non-overlapping matches in text.
1214 pub fn (r &Regex) find_all(text string) []Match {
1215     mut matches := []Match{}
1216     mut m := r.new_machine()
1217     mut i := 0
1218     for i <= text.len {
1219         if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1220             i++
1221             continue
1222         }
1223         if res := r.vm_match(text, i, mut m) {
1224             matches << res
1225             if res.end > i {
1226                 i = res.end
1227             } else {
1228                 // Empty match: advance by one full rune to avoid infinite loop
1229                 _, rune_len := read_rune_at(text.str, text.len, i)
1230                 i += if rune_len > 0 { rune_len } else { 1 }
1231             }
1232         } else {
1233             i++
1234         }
1235     }
1236     return matches
1237 }
1238 
1239 // replace finds the first match and replaces it using repl. Supports $1, $2 backreferences.
1240 pub fn (r &Regex) replace(text string, repl string) string {
1241     res := r.find(text) or { return text }
1242     mut sb := strings.new_builder(text.len + repl.len)
1243     sb.write_string(text[0..res.start])
1244     mut i := 0
1245     for i < repl.len {
1246         if repl[i] == `$` && i + 1 < repl.len {
1247             next_char := repl[i + 1]
1248             if next_char >= `0` && next_char <= `9` {
1249                 d := next_char - `0` - 1
1250                 if d >= 0 && d < res.groups.len {
1251                     sb.write_string(res.groups[d])
1252                 }
1253                 i += 2
1254                 continue
1255             }
1256         }
1257         sb.write_u8(repl[i])
1258         i++
1259     }
1260     sb.write_string(text[res.end..])
1261     return sb.str()
1262 }
1263 
1264 // fullmatch returns a Match only if the pattern matches the entire text.
1265 pub fn (r &Regex) fullmatch(text string) ?Match {
1266     mut m := r.new_machine()
1267     if res := r.vm_match(text, 0, mut m) {
1268         if res.end == text.len {
1269             return res
1270         }
1271     }
1272     return none
1273 }
1274 
1275 // group_by_name retrieves text captured by a named group.
1276 pub fn (r &Regex) group_by_name(m Match, name string) string {
1277     idx := r.group_map[name] or { return '' }
1278     return if idx < m.groups.len { m.groups[idx] } else { '' }
1279 }
1280 
1281 /******************************************************************************
1282 *
1283 * Compatibility Layer
1284 *
1285 ******************************************************************************/
1286 
1287 // new_regex is a helper wrapper to compile a regex pattern.
1288 pub fn new_regex(pattern string, _ int) !Regex {
1289     return compile(pattern)
1290 }
1291 
1292 // match_str is a compatibility alias for find_from.
1293 pub fn (r &Regex) match_str(text string, start_index int, _ int) ?Match {
1294     return r.find_from(text, start_index)
1295 }
1296 
1297 // get returns the matched text for a specific group index.
1298 // Index 0 returns the full match, 1..n returns capture groups.
1299 pub fn (m Match) get(idx int) ?string {
1300     if idx == 0 {
1301         return m.text
1302     }
1303     if idx > 0 && idx <= m.groups.len {
1304         return m.groups[idx - 1]
1305     }
1306     return none
1307 }
1308 
1309 // get_all returns a list of all captured strings, starting with the full match at index 0.
1310 pub fn (m Match) get_all() []string {
1311     mut res := [m.text]
1312     res << m.groups
1313     return res
1314 }
1315