Gitly


1 /*
2 regex2 0.9.6 beta (VM Edition) - Performance Optimized
3 
4 Copyright (c) 2026 Dario Deledda. All rights reserved.
5 Use of this source code is governed by an MIT license
6 that can be found in the LICENSE file.
7 
8 This file contains a regex module based on a Virtual Machine approach.
9 
10 Features:
11  - **Non-recursive VM**: Safe execution without stack overflow on complex patterns.
12  - UTF8 support with Fast ASCII path.
13  - Literal characters, '.', '*', '{m,n}'.
14  - Short quantifiers: '?', '+'.
15  - Non-greedy quantifiers: '*?', '+?', '??'.
16  - Nested groups: '()'.
17  - Named groups: '(?P<name>...)'.
18  - Non-capturing groups: '(?:...)'.
19  - Character classes: \w, \W, \d, \D, \s, \S, \a, \A.
20  - Hex escapes: \xHH, \XHHHH.
21  - Alternation: '|'.
22  - Character classes: [abc], [^abc], [a-z].
23  - Anchors: '^' (start of string), '$' (end of string).
24  - Word boundaries: '\b' (word boundary), '\B' (non-word boundary).
25  - Inline Flags:
26    - '(?i)' Case insensitive.
27    - '(?m)' Multiline (anchors match newlines).
28    - '(?s)' Dot-all (dot matches newline).
29 
30 Configuration:
31  - `max_stack_depth`: Controls the dynamic growth limit of the backtracking stack.
32     Default is 2048. Increase this value if you encounter complex patterns failing
33     on deep recursions/backtracking, or decrease it to limit memory usage.
34 
35 Functions:
36  - `compile(pattern) !Regex` -> Compiles a pattern into a Regex object.
37  - `(r Regex) find(text) ?Match` -> Finds the first match in a string.
38  - `(r Regex) find_all(text) []Match` -> Finds all non-overlapping matches.
39  - `(r Regex) find_from(text, start_index) ?Match` -> Finds the first match starting from a specific index.
40  - `(r Regex) replace(text, repl) string` -> Finds and replaces the first match.
41  - `(r Regex) fullmatch(text string) ?Match` -> Match exact strings.
42  - `(r Regex) group_by_name(m Match, name string) string` -> Get text of a named group.
43 
44 Key Architectural Features and Optimizations:
45  - **Pre-allocated Machine workspace**: All search operations are zero-allocation.
46  - **Raw pointer access**: Bypasses bounds checking for string and instruction access.
47  - **Fast ASCII Path**: Characters < 128 bypass heavy UTF-8 decoding.
48  - **Instruction merging**: Consecutive character matches are merged into string blocks.
49  - **Bitmap lookups**: ASCII character classes use a 128-bit bitset for O(1) matching.
50  - **NFA Virtual Machine**: Executes bytecode instructions to simulate pattern matching.
51  - **Dynamic Stack Growth**: Automatically expands the backtracking stack to prevent false negatives.
52  - **Zero-Allocation Search**: Reuses a pre-allocated Machine workspace for search operations.
53  - **Anchored Optimization**: Patterns starting with '^' skip the scanning loop.
54  - **Prefix Skipping**: Uses Boyer-Moore-like skipping for literal prefixes.
55 
56 */
57 
58 module pcre
59 
60 import strings
61 
62 /******************************************************************************
63 *
64 * VM Instruction Definition
65 *
66 ******************************************************************************/
67 
68 // InstType defines the operation codes for the Virtual Machine.
69 enum InstType as u8 {
70     match             // Halt and signal a successful match.
71     char              // Match a single UTF-8 rune.
72     string            // Match a sequence of ASCII characters (merged optimization).
73     any               // Match any character (dot).
74     class             // Match a character class (e.g., [a-z] or \d).
75     split             // Branch execution: target_x (primary), target_y (backtrack).
76     jmp               // Unconditional jump to target_x.
77     save              // Save current string position into a capture group index.
78     assert_start      // Assert current position is start of string.
79     assert_end        // Assert current position is end of string.
80     assert_line_start // Assert current position is start of a line (multiline mode).
81     assert_line_end   // Assert current position is end of a line (multiline mode).
82     assert_bound      // Assert word boundary (\b).
83     assert_nbound     // Assert non-word boundary (\B).
84 }
85 
86 // Inst represents a single bytecode instruction.
87 // Packed for memory locality.
88 struct Inst {
89 mut:
90     typ         InstType
91     val         rune   // Used by .char
92     val_str     string // Used by .string
93     val_len     int    // Byte length of val_str
94     target_x    int    // Primary jump/split target
95     target_y    int    // Backtrack target for .split
96     group_idx   int    // Capture group index for .save
97     char_class  []rune // Literal runes for character classes
98     bitmap      [4]u32 // 128-bit bitset for ASCII character classes
99     inverted    bool   // Negation flag for classes
100     ignore_case bool   // Case-insensitive flag
101     dot_all     bool   // Whether '.' matches '\n'
102 }
103 
104 // Machine provides a workspace for VM execution.
105 // To ensure thread safety, this is created per top-level API call.
106 pub struct Machine {
107 mut:
108     stack    []int // Backtracking stack stores: [capture_states..., string_ptr, next_pc]
109     captures []int // Flat array of [start, end] byte indices for groups
110 }
111 
112 // Regex is the compiled regular expression object.
113 pub struct Regex {
114 pub:
115     pattern      string         // The original regex string
116     prog         []Inst         // Compiled bytecode
117     total_groups int            // Number of capture groups defined in pattern
118     group_map    map[string]int // Mapping of names to indices for (?P<name>...)
119     prefix_lit   string         // Pre-calculated literal prefix for fast-skip optimization
120     has_prefix   bool           // Whether a literal prefix exists
121     anchored     bool           // True if pattern starts with '^' (optimization hint)
122 pub mut:
123     max_stack_depth int // User-defined stack limit hint
124 }
125 
126 // Match contains the results of a successful regex match.
127 pub struct Match {
128 pub:
129     text   string   // The full text of the match
130     start  int      // Starting byte index in the source string
131     end    int      // Ending byte index in the source string
132     groups []string // Sub-strings captured by groups
133 }
134 
135 // Internal structures for Compilation
136 struct Quantifier {
137 mut:
138     min    int
139     max    int // -1 for infinity
140     greedy bool
141 }
142 
143 struct Flags {
144 mut:
145     ignore_case bool
146     multiline   bool
147     dot_all     bool
148 }
149 
150 enum NodeType {
151     chr
152     any_char
153     group
154     alternation
155     char_class
156     start_of_string
157     end_of_string
158     word_boundary
159     non_word_boundary
160     word_char
161     non_word_char
162     digit
163     non_digit
164     whitespace
165     non_whitespace
166     lowercase_char
167     uppercase_char
168 }
169 
170 struct Node {
171 mut:
172     typ                 NodeType
173     chr                 rune
174     quant               Quantifier
175     nodes               []Node
176     alternatives        [][]Node
177     group_capture_index int = -1
178     char_set            []rune
179     inverted            bool
180     ignore_case         bool
181     multiline           bool
182     dot_all             bool
183 }
184 
185 /******************************************************************************
186 *
187 * Internal Utilities (Inlined for Speed)
188 *
189 ******************************************************************************/
190 
191 // read_rune_at decodes a UTF-8 rune from a byte pointer safely.
192 // Marked inline to be embedded directly into the VM loop.
193 @[inline]
194 fn read_rune_at(str &u8, len int, index int) (rune, int) {
195     unsafe {
196         if index >= len {
197             return 0, 0
198         }
199         b0 := u32(str[index])
200         if b0 < 0x80 {
201             return rune(b0), 1
202         }
203         if (b0 & 0xE0) == 0xC0 && index + 1 < len {
204             return rune(((b0 & 0x1F) << 6) | (u32(str[index + 1]) & 0x3F)), 2
205         }
206         if (b0 & 0xF0) == 0xE0 && index + 2 < len {
207             return rune(((b0 & 0x0F) << 12) | ((u32(str[index + 1]) & 0x3F) << 6) | (u32(str[
208                 index + 2]) & 0x3F)), 3
209         }
210         if (b0 & 0xF8) == 0xF0 && index + 3 < len {
211             return rune(((b0 & 0x07) << 18) | ((u32(str[index + 1]) & 0x3F) << 12) | ((u32(str[
212                 index + 2]) & 0x3F) << 6) | (u32(str[index + 3]) & 0x3F)), 4
213         }
214     }
215     return 0, 0
216 }
217 
218 // is_word_char returns true for alphanumeric ASCII characters and underscore.
219 @[inline]
220 fn is_word_char(r u8) bool {
221     return (r >= `a` && r <= `z`) || (r >= `A` && r <= `Z`) || (r >= `0` && r <= `9`) || r == `_`
222 }
223 
224 // set_bitmap sets a specific bit in a 128-bit bitset for ASCII matching.
225 @[inline]
226 fn set_bitmap(mut bitmap [4]u32, r rune) {
227     if r >= 0 && r < 128 {
228         idx := u32(r) >> 5
229         bit := u32(r) & 31
230         bitmap[idx] |= (u32(1) << bit)
231     }
232 }
233 
234 /******************************************************************************
235 *
236 * Compiler Logic
237 *
238 ******************************************************************************/
239 
240 // compile transforms a regex pattern string into a Regex object.
241 pub fn compile(pattern string) !Regex {
242     mut group_map := map[string]int{}
243     initial_flags := Flags{false, false, false}
244 
245     // Phase 1: AST Parsing
246     nodes, _, final_group_count := parse_nodes(pattern, 0, `\0`, 0, initial_flags, mut group_map)!
247 
248     root := Node{
249         typ:   .group
250         nodes: nodes
251         quant: Quantifier{1, 1, true}
252     }
253 
254     // Phase 2: Bytecode Emission
255     mut compiler := Compiler{
256         prog: []Inst{cap: 128}
257     }
258     compiler.emit_node(root)
259     compiler.emit(Inst{ typ: .match })
260 
261     // Phase 3: Optimization
262     optimized_prog := compiler.optimize()
263 
264     // Detect Prefix and Anchor optimizations
265     mut prefix := ''
266     mut has_prefix := false
267     mut anchored := false
268 
269     if optimized_prog.len > 0 {
270         first := optimized_prog[0]
271         if first.typ == .string {
272             prefix = first.val_str
273             has_prefix = true
274         } else if first.typ == .char && !first.ignore_case && first.val < 128 {
275             prefix = unsafe { u8(first.val).ascii_str() }
276             has_prefix = true
277         } else if first.typ == .assert_start {
278             anchored = true
279         }
280     }
281 
282     return Regex{
283         max_stack_depth: 2048
284         pattern:         pattern
285         prog:            optimized_prog
286         total_groups:    final_group_count
287         group_map:       group_map
288         prefix_lit:      prefix
289         has_prefix:      has_prefix
290         anchored:        anchored
291     }
292 }
293 
294 // new_machine allocates a new VM state machine.
295 // This isolates the runtime memory (stack/captures) from the compiled regex, allowing thread-safe usage.
296 pub fn (r &Regex) new_machine() Machine {
297     // Pre-allocate enough space for stack and captures to avoid re-allocation in hot path
298     return Machine{
299         stack:    []int{len: r.max_stack_depth}
300         captures: []int{len: r.total_groups * 2}
301     }
302 }
303 
304 // Compiler holds the state for generating the bytecode instructions.
305 struct Compiler {
306 mut:
307     prog []Inst
308 }
309 
310 // emit appends an instruction to the program and returns its index.
311 fn (mut c Compiler) emit(i Inst) int {
312     c.prog << i
313     return c.prog.len - 1
314 }
315 
316 // optimize merges consecutive literal characters into single string instructions
317 // and resolves jump targets to absolute indices.
318 fn (mut c Compiler) optimize() []Inst {
319     mut targets := map[int]bool{}
320     for inst in c.prog {
321         if inst.typ == .split || inst.typ == .jmp {
322             targets[inst.target_x] = true
323             targets[inst.target_y] = true
324         }
325     }
326 
327     mut new_prog := []Inst{cap: c.prog.len}
328     mut idx_map := map[int]int{}
329 
330     mut i := 0
331     for i < c.prog.len {
332         inst := c.prog[i]
333         idx_map[i] = new_prog.len
334 
335         // Optimization: Merge consecutive chars
336         if inst.typ == .char && !inst.ignore_case && inst.val < 128 {
337             mut s_val := unsafe { u8(inst.val).ascii_str() }
338             mut j := i + 1
339             for j < c.prog.len && !targets[j] {
340                 next := c.prog[j]
341                 if next.typ == .char && !next.ignore_case && next.val < 128 {
342                     s_val += unsafe { u8(next.val).ascii_str() }
343                     j++
344                 } else {
345                     break
346                 }
347             }
348             if j > i + 1 {
349                 new_prog << Inst{
350                     typ:     .string
351                     val_str: s_val
352                     val_len: s_val.len
353                 }
354                 for k := i + 1; k < j; k++ {
355                     idx_map[k] = new_prog.len
356                 }
357                 i = j
358                 continue
359             }
360         }
361         new_prog << inst
362         i++
363     }
364     idx_map[c.prog.len] = new_prog.len
365 
366     // Fix jump offsets
367     for mut inst in new_prog {
368         if inst.typ == .split || inst.typ == .jmp {
369             inst.target_x = idx_map[inst.target_x] or { inst.target_x }
370             inst.target_y = idx_map[inst.target_y] or { inst.target_y }
371         }
372     }
373     return new_prog
374 }
375 
376 // emit_class generates the instructions for a character class node.
377 // It populates the bitmap for O(1) ASCII matching and the slice for Unicode.
378 fn (mut c Compiler) emit_class(node Node) {
379     mut bitmap := [4]u32{}
380     mut char_class := node.char_set.clone()
381     mut inverted := node.inverted
382 
383     // Pre-compile common classes into the bitmap for O(1) lookups
384     match node.typ {
385         .word_char {
386             for r := `0`; r <= `9`; r++ {
387                 set_bitmap(mut bitmap, r)
388             }
389             for r := `a`; r <= `z`; r++ {
390                 set_bitmap(mut bitmap, r)
391             }
392             for r := `A`; r <= `Z`; r++ {
393                 set_bitmap(mut bitmap, r)
394             }
395             set_bitmap(mut bitmap, `_`)
396         }
397         .non_word_char {
398             inverted = true
399             for r := `0`; r <= `9`; r++ {
400                 set_bitmap(mut bitmap, r)
401             }
402             for r := `a`; r <= `z`; r++ {
403                 set_bitmap(mut bitmap, r)
404             }
405             for r := `A`; r <= `Z`; r++ {
406                 set_bitmap(mut bitmap, r)
407             }
408             set_bitmap(mut bitmap, `_`)
409         }
410         .digit {
411             for r := `0`; r <= `9`; r++ {
412                 set_bitmap(mut bitmap, r)
413             }
414         }
415         .non_digit {
416             inverted = true
417             for r := `0`; r <= `9`; r++ {
418                 set_bitmap(mut bitmap, r)
419             }
420         }
421         .whitespace {
422             for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
423                 set_bitmap(mut bitmap, r)
424             }
425         }
426         .non_whitespace {
427             inverted = true
428             for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
429                 set_bitmap(mut bitmap, r)
430             }
431         }
432         .lowercase_char {
433             for r := `a`; r <= `z`; r++ {
434                 set_bitmap(mut bitmap, r)
435             }
436         }
437         .uppercase_char {
438             for r := `A`; r <= `Z`; r++ {
439                 set_bitmap(mut bitmap, r)
440             }
441         }
442         .char_class {
443             for r in node.char_set {
444                 set_bitmap(mut bitmap, r)
445                 if node.ignore_case {
446                     if r >= `a` && r <= `z` {
447                         set_bitmap(mut bitmap, r - 32)
448                     } else if r >= `A` && r <= `Z` {
449                         set_bitmap(mut bitmap, r + 32)
450                     }
451                 }
452             }
453         }
454         else {}
455     }
456 
457     c.emit(Inst{
458         typ:         .class
459         char_class:  char_class
460         bitmap:      bitmap
461         inverted:    inverted
462         ignore_case: node.ignore_case
463     })
464 }
465 
466 // emit_node handles quantifiers and loops, delegating the actual logic to emit_logic.
467 fn (mut c Compiler) emit_node(node Node) {
468     for _ in 0 .. node.quant.min {
469         c.emit_logic(node)
470     }
471 
472     if node.quant.max == -1 {
473         split_idx := c.emit(Inst{ typ: .split })
474         start_pc := c.prog.len
475         c.emit_logic(node)
476         c.emit(Inst{ typ: .jmp, target_x: split_idx })
477         if node.quant.greedy {
478             c.prog[split_idx].target_x = start_pc
479             c.prog[split_idx].target_y = c.prog.len
480         } else {
481             c.prog[split_idx].target_x = c.prog.len
482             c.prog[split_idx].target_y = start_pc
483         }
484     } else if node.quant.max > node.quant.min {
485         rem := node.quant.max - node.quant.min
486         mut splits := []int{}
487         for _ in 0 .. rem {
488             idx := c.emit(Inst{ typ: .split })
489             match_pc := c.prog.len
490             if node.quant.greedy {
491                 c.prog[idx].target_x = match_pc
492             } else {
493                 c.prog[idx].target_y = match_pc
494             }
495             c.emit_logic(node)
496             splits << idx
497         }
498         end_pc := c.prog.len
499         for idx in splits {
500             if node.quant.greedy {
501                 c.prog[idx].target_y = end_pc
502             } else {
503                 c.prog[idx].target_x = end_pc
504             }
505         }
506     }
507 }
508 
509 // emit_logic generates instructions for specific node types (char, group, alternation).
510 fn (mut c Compiler) emit_logic(node Node) {
511     match node.typ {
512         .chr {
513             c.emit(Inst{ typ: .char, val: node.chr, ignore_case: node.ignore_case })
514         }
515         .any_char {
516             c.emit(Inst{ typ: .any, dot_all: node.dot_all })
517         }
518         .group {
519             if node.group_capture_index != -1 {
520                 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 })
521             }
522             for child in node.nodes {
523                 c.emit_node(child)
524             }
525             if node.group_capture_index != -1 {
526                 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 + 1 })
527             }
528         }
529         .alternation {
530             if node.alternatives.len == 0 {
531                 return
532             }
533             mut end_jumps := []int{}
534             for i := 0; i < node.alternatives.len - 1; i++ {
535                 split_idx := c.emit(Inst{ typ: .split })
536                 c.prog[split_idx].target_x = c.prog.len
537                 for child in node.alternatives[i] {
538                     c.emit_node(child)
539                 }
540                 end_jumps << c.emit(Inst{ typ: .jmp })
541                 c.prog[split_idx].target_y = c.prog.len
542             }
543             for child in node.alternatives.last() {
544                 c.emit_node(child)
545             }
546             for idx in end_jumps {
547                 c.prog[idx].target_x = c.prog.len
548             }
549         }
550         .start_of_string {
551             c.emit(Inst{
552                 typ: if node.multiline { InstType.assert_line_start } else { InstType.assert_start }
553             })
554         }
555         .end_of_string {
556             c.emit(Inst{
557                 typ: if node.multiline { InstType.assert_line_end } else { InstType.assert_end }
558             })
559         }
560         .word_boundary {
561             c.emit(Inst{ typ: .assert_bound })
562         }
563         .non_word_boundary {
564             c.emit(Inst{ typ: .assert_nbound })
565         }
566         else {
567             c.emit_class(node)
568         }
569     }
570 }
571 
572 // parse_nodes implements a recursive descent parser to construct the AST from the pattern string.
573 fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_start int, passed_flags Flags, mut group_map map[string]int) !([]Node, int, int) {
574     mut pos := pos_start
575     mut group_counter := group_counter_start
576     mut current_flags := passed_flags
577     mut alternatives := [][]Node{}
578     mut current_sequence := []Node{}
579 
580     for pos < pattern.len {
581         chr, char_len := read_rune_at(pattern.str, pattern.len, pos)
582         if chr == terminator {
583             pos += char_len
584             if alternatives.len > 0 {
585                 alternatives << current_sequence
586                 return [
587                     Node{
588                         typ:          .alternation
589                         alternatives: alternatives
590                         quant:        Quantifier{1, 1, true}
591                     },
592                 ], pos, group_counter
593             }
594             return current_sequence, pos, group_counter
595         }
596         if chr == `|` {
597             pos += char_len
598             if current_sequence.len == 0 {
599                 return error('Empty alternative')
600             }
601             alternatives << current_sequence
602             current_sequence = []Node{}
603             continue
604         }
605 
606         mut parsed_nodes := []Node{}
607         match chr {
608             `^` {
609                 pos += char_len
610                 parsed_nodes << Node{
611                     typ:       .start_of_string
612                     multiline: current_flags.multiline
613                 }
614             }
615             `$` {
616                 pos += char_len
617                 parsed_nodes << Node{
618                     typ:       .end_of_string
619                     multiline: current_flags.multiline
620                 }
621             }
622             `.` {
623                 pos += char_len
624                 parsed_nodes << Node{
625                     typ:     .any_char
626                     dot_all: current_flags.dot_all
627                 }
628             }
629             `*`, `+`, `?`, `{` {
630                 return error('Quantifier must follow a token')
631             }
632             `(` {
633                 pos += char_len
634                 mut idx := -1
635                 mut cap := true
636                 if pos < pattern.len && pattern[pos] == `?` {
637                     pos++
638                     if pos < pattern.len && pattern[pos] in [`i`, `m`, `s`] {
639                         for pos < pattern.len && pattern[pos] != `)` {
640                             match pattern[pos] {
641                                 `i` { current_flags.ignore_case = true }
642                                 `m` { current_flags.multiline = true }
643                                 `s` { current_flags.dot_all = true }
644                                 else {}
645                             }
646 
647                             pos++
648                         }
649                         if pos < pattern.len {
650                             pos++
651                         }
652                         continue
653                     } else if pos < pattern.len && pattern[pos] == `:` {
654                         cap = false
655                         pos++
656                     } else if pos < pattern.len && pattern[pos] == `P` {
657                         pos++
658                         if pos >= pattern.len || pattern[pos] != `<` {
659                             return error('Invalid named group syntax')
660                         }
661                         pos++
662                         end := pattern.index_after('>', pos) or { -1 }
663                         if end == -1 {
664                             return error('Unclosed named group')
665                         }
666                         name := pattern[pos..end]
667                         idx = group_counter
668                         group_map[name] = idx
669                         pos = end + 1
670                     }
671                 }
672                 if cap {
673                     if idx == -1 {
674                         idx = group_counter
675                     }
676                     group_counter++
677                 }
678                 sub, new_p, new_c := parse_nodes(pattern, pos, `)`, group_counter, current_flags, mut
679                     group_map)!
680                 pos = new_p
681                 group_counter = new_c
682                 parsed_nodes << Node{
683                     typ:                 .group
684                     nodes:               sub
685                     group_capture_index: idx
686                 }
687             }
688             `[` {
689                 pos += char_len
690                 end := pattern.index_after(']', pos) or { -1 }
691                 if end == -1 {
692                     return error('Unclosed character class')
693                 }
694                 content := pattern[pos..end]
695                 pos = end + 1
696                 inv := content.len > 0 && content[0] == `^`
697                 mut set := []rune{}
698                 mut i := if inv { 1 } else { 0 }
699                 for i < content.len {
700                     c, l := read_rune_at(content.str, content.len, i)
701                     if i + l + 1 < content.len && content[i + l] == `-` {
702                         ec, el := read_rune_at(content.str, content.len, i + l + 1)
703                         for r := c; r <= ec; r++ {
704                             set << r
705                         }
706                         i += l + 1 + el
707                     } else {
708                         set << c
709                         i += l
710                     }
711                 }
712                 parsed_nodes << Node{
713                     typ:         .char_class
714                     char_set:    set
715                     inverted:    inv
716                     ignore_case: current_flags.ignore_case
717                 }
718             }
719             `\\` {
720                 pos += char_len
721                 if pos >= pattern.len {
722                     return error('Trailing backslash')
723                 }
724                 esc, el := read_rune_at(pattern.str, pattern.len, pos)
725                 pos += el
726                 match esc {
727                     `w` {
728                         parsed_nodes << Node{
729                             typ: .word_char
730                         }
731                     }
732                     `W` {
733                         parsed_nodes << Node{
734                             typ: .non_word_char
735                         }
736                     }
737                     `d` {
738                         parsed_nodes << Node{
739                             typ: .digit
740                         }
741                     }
742                     `D` {
743                         parsed_nodes << Node{
744                             typ: .non_digit
745                         }
746                     }
747                     `s` {
748                         parsed_nodes << Node{
749                             typ: .whitespace
750                         }
751                     }
752                     `S` {
753                         parsed_nodes << Node{
754                             typ: .non_whitespace
755                         }
756                     }
757                     `b` {
758                         parsed_nodes << Node{
759                             typ: .word_boundary
760                         }
761                     }
762                     `B` {
763                         parsed_nodes << Node{
764                             typ: .non_word_boundary
765                         }
766                     }
767                     `a` {
768                         parsed_nodes << Node{
769                             typ: .lowercase_char
770                         }
771                     }
772                     `A` {
773                         parsed_nodes << Node{
774                             typ: .uppercase_char
775                         }
776                     }
777                     else {
778                         parsed_nodes << Node{
779                             typ:         .chr
780                             chr:         esc
781                             ignore_case: current_flags.ignore_case
782                         }
783                     }
784                 }
785             }
786             else {
787                 pos += char_len
788                 parsed_nodes << Node{
789                     typ:         .chr
790                     chr:         chr
791                     ignore_case: current_flags.ignore_case
792                 }
793             }
794         }
795 
796         if parsed_nodes.len > 0 {
797             mut q := Quantifier{1, 1, true}
798             if pos < pattern.len {
799                 peek := pattern[pos]
800                 match peek {
801                     `*` {
802                         q = Quantifier{0, -1, true}
803                         pos++
804                     }
805                     `+` {
806                         q = Quantifier{1, -1, true}
807                         pos++
808                     }
809                     `?` {
810                         q = Quantifier{0, 1, true}
811                         pos++
812                     }
813                     `{` {
814                         end := pattern.index_after('}', pos) or { -1 }
815                         if end == -1 {
816                             return error('Unclosed quantifier')
817                         }
818                         parts := pattern[pos + 1..end].split(',')
819                         min := parts[0].int()
820                         max := if parts.len > 1 {
821                             if parts[1] == '' { -1 } else { parts[1].int() }
822                         } else {
823                             min
824                         }
825                         q = Quantifier{min, max, true}
826                         pos = end + 1
827                     }
828                     else {}
829                 }
830 
831                 if pos < pattern.len && pattern[pos] == `?` {
832                     q.greedy = false
833                     pos++
834                 }
835             }
836             parsed_nodes.last().quant = q
837         }
838         current_sequence << parsed_nodes
839     }
840 
841     if alternatives.len > 0 {
842         if current_sequence.len == 0 {
843             return error('Empty alternative at end of pattern')
844         }
845         alternatives << current_sequence
846         return [
847             Node{
848                 typ:          .alternation
849                 alternatives: alternatives
850                 quant:        Quantifier{1, 1, true}
851             },
852         ], pos, group_counter
853     }
854     return current_sequence, pos, group_counter
855 }
856 
857 /******************************************************************************
858 *
859 * Virtual Machine Execution Engine (Highly Optimized)
860 *
861 ******************************************************************************/
862 
863 // vm_match executes the bytecode against the input string using the provided Machine state.
864 // OPTIMIZATION: Uses raw pointers for instruction and stack access to bypass bounds checking.
865 @[direct_array_access]
866 fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match {
867     unsafe {
868         // Optimization: Cast voidptr to typed pointer for direct indexing
869         mut cap_ptr := &int(m.captures.data)
870         cap_len := m.captures.len
871 
872         // Fast clear of captures using pointer arithmetic (memset-like)
873         for i := 0; i < cap_len; i++ {
874             cap_ptr[i] = -1
875         }
876 
877         mut sp := start_pos // String Pointer Index
878         mut stack_ptr := 0 // Stack Pointer Offset
879 
880         cap_size := r.total_groups * 2
881         frame_size := cap_size + 2 // [captures..., saved_sp, saved_pc]
882 
883         // Raw pointers for hot path access
884         prog_start := &Inst(r.prog.data)
885         mut inst_ptr := prog_start // PC as a pointer
886 
887         str_ptr := text.str
888         str_len := text.len
889 
890         // Cache stack data pointer (cast to typed pointer)
891         mut stack_data := &int(m.stack.data)
892         mut stack_max := m.stack.len
893 
894         for {
895             // Check if we walked off the program (should be caught by match inst)
896             // Using pointer arithmetic: offset = (inst_ptr - prog_start)
897 
898             match inst_ptr.typ {
899                 .match {
900                     // Only allocate result strings on successful match
901                     mut s_groups := []string{cap: r.total_groups}
902                     for i := 0; i < r.total_groups; i++ {
903                         s, e := cap_ptr[i * 2], cap_ptr[i * 2 + 1]
904                         s_groups << if s != -1 && e >= s { text[s..e] } else { '' }
905                     }
906                     return Match{
907                         text:   text[start_pos..sp]
908                         start:  start_pos
909                         end:    sp
910                         groups: s_groups
911                     }
912                 }
913                 .char {
914                     if sp >= str_len {
915                         goto backtrack
916                     }
917                     curr_byte := str_ptr[sp]
918 
919                     // Fast ASCII path
920                     if curr_byte < 128 && inst_ptr.val < 128 {
921                         mut c1, mut c2 := curr_byte, u8(inst_ptr.val)
922                         if inst_ptr.ignore_case {
923                             if c1 >= `a` && c1 <= `z` {
924                                 c1 -= 32
925                             }
926                             if c2 >= `a` && c2 <= `z` {
927                                 c2 -= 32
928                             }
929                         }
930                         if c1 == c2 {
931                             sp++
932                             inst_ptr++
933                             continue
934                         }
935                         goto backtrack
936                     }
937 
938                     // UTF-8 Path
939                     rn, l := read_rune_at(str_ptr, str_len, sp)
940                     if l == 0 {
941                         goto backtrack
942                     }
943 
944                     mut match_ok := false
945                     if inst_ptr.ignore_case {
946                         r1 := if rn >= `a` && rn <= `z` { rn - 32 } else { rn }
947                         r2 := if inst_ptr.val >= `a` && inst_ptr.val <= `z` {
948                             inst_ptr.val - 32
949                         } else {
950                             inst_ptr.val
951                         }
952                         if r1 == r2 {
953                             match_ok = true
954                         }
955                     } else if rn == inst_ptr.val {
956                         match_ok = true
957                     }
958 
959                     if match_ok {
960                         sp += l
961                         inst_ptr++
962                     } else {
963                         goto backtrack
964                     }
965                 }
966                 .string {
967                     if sp + inst_ptr.val_len > str_len {
968                         goto backtrack
969                     }
970                     // Inline memcmp
971                     v_str := inst_ptr.val_str.str
972                     for i in 0 .. inst_ptr.val_len {
973                         if str_ptr[sp + i] != v_str[i] {
974                             goto backtrack
975                         }
976                     }
977                     sp += inst_ptr.val_len
978                     inst_ptr++
979                 }
980                 .class {
981                     if sp >= str_len {
982                         goto backtrack
983                     }
984                     c_byte := str_ptr[sp]
985                     mut matched := false
986                     mut cl := 1
987 
988                     // Optimization: Bitmap lookup for ASCII
989                     if c_byte < 128 {
990                         if (inst_ptr.bitmap[c_byte >> 5] & (u32(1) << (c_byte & 31))) != 0 {
991                             matched = true
992                         }
993                     } else {
994                         rn, l := read_rune_at(str_ptr, str_len, sp)
995                         cl = l
996                         for cr in inst_ptr.char_class {
997                             if cr == rn {
998                                 matched = true
999                                 break
1000                             }
1001                         }
1002                     }
1003 
1004                     if matched != inst_ptr.inverted {
1005                         sp += cl
1006                         inst_ptr++
1007                     } else {
1008                         goto backtrack
1009                     }
1010                 }
1011                 .any {
1012                     if sp >= str_len {
1013                         goto backtrack
1014                     }
1015                     if inst_ptr.dot_all || str_ptr[sp] != `\n` {
1016                         _, cl := read_rune_at(str_ptr, str_len, sp)
1017                         sp += cl
1018                         inst_ptr++
1019                     } else {
1020                         goto backtrack
1021                     }
1022                 }
1023                 .save {
1024                     cap_ptr[inst_ptr.group_idx] = sp
1025                     inst_ptr++
1026                 }
1027                 .split {
1028                     if stack_ptr + frame_size >= stack_max {
1029                         new_size := stack_max * 2
1030                         if new_size > 1_000_000 {
1031                             goto backtrack
1032                         }
1033                         m.stack.grow_len(new_size)
1034                         stack_data = &int(m.stack.data) // Pointer might change on realloc
1035                         stack_max = new_size
1036                     }
1037 
1038                     // Optimization: Unrolled stack push
1039                     stack_offset := stack_ptr
1040                     for i in 0 .. cap_size {
1041                         stack_data[stack_offset + i] = cap_ptr[i]
1042                     }
1043                     stack_data[stack_offset + cap_size] = sp
1044                     // Save backtrack target PC index
1045                     stack_data[stack_offset + cap_size + 1] = inst_ptr.target_y
1046 
1047                     stack_ptr += frame_size
1048                     // Jump to primary target (convert index to pointer)
1049                     // FIX: Use pointer indexing instead of addition to avoid type mismatch
1050                     inst_ptr = &prog_start[inst_ptr.target_x]
1051                 }
1052                 .jmp {
1053                     inst_ptr = &prog_start[inst_ptr.target_x]
1054                 }
1055                 .assert_start {
1056                     if sp == 0 {
1057                         inst_ptr++
1058                     } else {
1059                         goto backtrack
1060                     }
1061                 }
1062                 .assert_end {
1063                     if sp == str_len {
1064                         inst_ptr++
1065                     } else {
1066                         goto backtrack
1067                     }
1068                 }
1069                 .assert_line_start {
1070                     if sp == 0 || (sp > 0 && str_ptr[sp - 1] == `\n`) {
1071                         inst_ptr++
1072                     } else {
1073                         goto backtrack
1074                     }
1075                 }
1076                 .assert_line_end {
1077                     if sp == str_len || str_ptr[sp] == `\n` {
1078                         inst_ptr++
1079                     } else {
1080                         goto backtrack
1081                     }
1082                 }
1083                 .assert_bound, .assert_nbound {
1084                     l := if sp > 0 { is_word_char(str_ptr[sp - 1]) } else { false }
1085                     r_ := if sp < str_len { is_word_char(str_ptr[sp]) } else { false }
1086                     if (inst_ptr.typ == .assert_bound && l != r_)
1087                         || (inst_ptr.typ == .assert_nbound && l == r_) {
1088                         inst_ptr++
1089                     } else {
1090                         goto backtrack
1091                     }
1092                 }
1093             }
1094 
1095             continue
1096 
1097             backtrack:
1098             if stack_ptr <= 0 {
1099                 return none
1100             }
1101 
1102             stack_ptr -= frame_size
1103             stack_offset := stack_ptr
1104 
1105             // Restore captures
1106             for i in 0 .. cap_size {
1107                 cap_ptr[i] = stack_data[stack_offset + i]
1108             }
1109             sp = stack_data[stack_offset + cap_size]
1110             // Restore PC from index (explicit cast for pointer arithmetic)
1111             // FIX: Use pointer indexing instead of addition
1112             inst_ptr = &prog_start[stack_data[stack_offset + cap_size + 1]]
1113         }
1114     }
1115     return none
1116 }
1117 
1118 /******************************************************************************
1119 *
1120 * Public API
1121 *
1122 ******************************************************************************/
1123 
1124 // find returns the first match of the regex in text.
1125 pub fn (r &Regex) find(text string) ?Match {
1126     return r.find_from(text, 0)
1127 }
1128 
1129 // find_from returns the first match starting from start_index.
1130 // Optimized with fast prefix skipping and anchor checks.
1131 pub fn (r &Regex) find_from(text string, start_index int) ?Match {
1132     if start_index < 0 || start_index > text.len {
1133         return none
1134     }
1135     mut m := r.new_machine()
1136 
1137     // Optimization: Anchored pattern (^) only checks the start
1138     if r.anchored {
1139         if start_index == 0 {
1140             return r.vm_match(text, 0, mut m)
1141         }
1142         // If multiline mode is NOT enabled, ^ only matches index 0.
1143         // If multiline is enabled, we need to check every newline, handled by logic below.
1144         // Note: The compiler sets anchored=true only for ^ at start.
1145         // If multiline flag is set dynamically inside pattern, strict anchoring logic might differ,
1146         // but standard ^ usage benefits here.
1147     }
1148 
1149     // Optimization: Boyer-Moore-like literal prefix skip
1150     if r.has_prefix {
1151         mut i := text.index_after(r.prefix_lit, start_index) or { -1 }
1152         for i != -1 {
1153             if res := r.vm_match(text, i, mut m) {
1154                 return res
1155             }
1156             i = text.index_after(r.prefix_lit, i + 1) or { -1 }
1157         }
1158         return none
1159     }
1160 
1161     for i := start_index; i <= text.len; i++ {
1162         // Skip UTF-8 continuation bytes to ensure we only match at rune boundaries
1163         // 0xC0 (11000000) is the start of a multi-byte sequence, 0x80 (10000000) is a continuation
1164         if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1165             continue
1166         }
1167         if res := r.vm_match(text, i, mut m) {
1168             return res
1169         }
1170     }
1171     return none
1172 }
1173 
1174 // find_all returns all non-overlapping matches in text.
1175 pub fn (r &Regex) find_all(text string) []Match {
1176     mut matches := []Match{}
1177     mut m := r.new_machine()
1178     mut i := 0
1179     for i <= text.len {
1180         if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1181             i++
1182             continue
1183         }
1184         if res := r.vm_match(text, i, mut m) {
1185             matches << res
1186             i = if res.end > i { res.end } else { i + 1 }
1187         } else {
1188             i++
1189         }
1190     }
1191     return matches
1192 }
1193 
1194 // replace finds the first match and replaces it using repl. Supports $1, $2 backreferences.
1195 pub fn (r &Regex) replace(text string, repl string) string {
1196     res := r.find(text) or { return text }
1197     mut sb := strings.new_builder(text.len + repl.len)
1198     sb.write_string(text[0..res.start])
1199     mut i := 0
1200     for i < repl.len {
1201         if repl[i] == `$` && i + 1 < repl.len {
1202             next_char := repl[i + 1]
1203             if next_char >= `0` && next_char <= `9` {
1204                 d := next_char - `0` - 1
1205                 if d >= 0 && d < res.groups.len {
1206                     sb.write_string(res.groups[d])
1207                 }
1208                 i += 2
1209                 continue
1210             }
1211         }
1212         sb.write_u8(repl[i])
1213         i++
1214     }
1215     sb.write_string(text[res.end..])
1216     return sb.str()
1217 }
1218 
1219 // fullmatch returns a Match only if the pattern matches the entire text.
1220 pub fn (r &Regex) fullmatch(text string) ?Match {
1221     mut m := r.new_machine()
1222     if res := r.vm_match(text, 0, mut m) {
1223         if res.end == text.len {
1224             return res
1225         }
1226     }
1227     return none
1228 }
1229 
1230 // group_by_name retrieves text captured by a named group.
1231 pub fn (r &Regex) group_by_name(m Match, name string) string {
1232     idx := r.group_map[name] or { return '' }
1233     return if idx < m.groups.len { m.groups[idx] } else { '' }
1234 }
1235 
1236 /******************************************************************************
1237 *
1238 * Compatibility Layer
1239 *
1240 ******************************************************************************/
1241 
1242 // new_regex is a helper wrapper to compile a regex pattern.
1243 pub fn new_regex(pattern string, _ int) !Regex {
1244     return compile(pattern)
1245 }
1246 
1247 // match_str is a compatibility alias for find_from.
1248 pub fn (r &Regex) match_str(text string, start_index int, _ int) ?Match {
1249     return r.find_from(text, start_index)
1250 }
1251 
1252 // get returns the matched text for a specific group index.
1253 // Index 0 returns the full match, 1..n returns capture groups.
1254 pub fn (m Match) get(idx int) ?string {
1255     if idx == 0 {
1256         return m.text
1257     }
1258     if idx > 0 && idx <= m.groups.len {
1259         return m.groups[idx - 1]
1260     }
1261     return none
1262 }
1263 
1264 // get_all returns a list of all captured strings, starting with the full match at index 0.
1265 pub fn (m Match) get_all() []string {
1266     mut res := [m.text]
1267     res << m.groups
1268     return res
1269 }
1270