v2 / vlib / regex / pcre / regex.v
1269 lines · 1179 sloc · 32.58 KB · 8e35f4d9848f7ad35d857a187dddbfd2eca5e19d
Raw
1/*
2regex2 0.9.6 beta (VM Edition) - Performance Optimized
3
4Copyright (c) 2026 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7
8This file contains a regex module based on a Virtual Machine approach.
9
10Features:
11 - **Non-recursive VM**: Safe execution without stack overflow on complex patterns.
12 - UTF8 support with Fast ASCII path.
13 - Literal characters, '.', '*', '{m,n}'.
14 - Short quantifiers: '?', '+'.
15 - Non-greedy quantifiers: '*?', '+?', '??'.
16 - Nested groups: '()'.
17 - Named groups: '(?P<name>...)'.
18 - Non-capturing groups: '(?:...)'.
19 - Character classes: \w, \W, \d, \D, \s, \S, \a, \A.
20 - Hex escapes: \xHH, \XHHHH.
21 - Alternation: '|'.
22 - Character classes: [abc], [^abc], [a-z].
23 - Anchors: '^' (start of string), '$' (end of string).
24 - Word boundaries: '\b' (word boundary), '\B' (non-word boundary).
25 - Inline Flags:
26 - '(?i)' Case insensitive.
27 - '(?m)' Multiline (anchors match newlines).
28 - '(?s)' Dot-all (dot matches newline).
29
30Configuration:
31 - `max_stack_depth`: Controls the dynamic growth limit of the backtracking stack.
32 Default is 2048. Increase this value if you encounter complex patterns failing
33 on deep recursions/backtracking, or decrease it to limit memory usage.
34
35Functions:
36 - `compile(pattern) !Regex` -> Compiles a pattern into a Regex object.
37 - `(r Regex) find(text) ?Match` -> Finds the first match in a string.
38 - `(r Regex) find_all(text) []Match` -> Finds all non-overlapping matches.
39 - `(r Regex) find_from(text, start_index) ?Match` -> Finds the first match starting from a specific index.
40 - `(r Regex) replace(text, repl) string` -> Finds and replaces the first match.
41 - `(r Regex) fullmatch(text string) ?Match` -> Match exact strings.
42 - `(r Regex) group_by_name(m Match, name string) string` -> Get text of a named group.
43
44Key Architectural Features and Optimizations:
45 - **Pre-allocated Machine workspace**: All search operations are zero-allocation.
46 - **Raw pointer access**: Bypasses bounds checking for string and instruction access.
47 - **Fast ASCII Path**: Characters < 128 bypass heavy UTF-8 decoding.
48 - **Instruction merging**: Consecutive character matches are merged into string blocks.
49 - **Bitmap lookups**: ASCII character classes use a 128-bit bitset for O(1) matching.
50 - **NFA Virtual Machine**: Executes bytecode instructions to simulate pattern matching.
51 - **Dynamic Stack Growth**: Automatically expands the backtracking stack to prevent false negatives.
52 - **Zero-Allocation Search**: Reuses a pre-allocated Machine workspace for search operations.
53 - **Anchored Optimization**: Patterns starting with '^' skip the scanning loop.
54 - **Prefix Skipping**: Uses Boyer-Moore-like skipping for literal prefixes.
55
56*/
57
58module pcre
59
60import strings
61
62/******************************************************************************
63*
64* VM Instruction Definition
65*
66******************************************************************************/
67
68// InstType defines the operation codes for the Virtual Machine.
69enum InstType as u8 {
70 match // Halt and signal a successful match.
71 char // Match a single UTF-8 rune.
72 string // Match a sequence of ASCII characters (merged optimization).
73 any // Match any character (dot).
74 class // Match a character class (e.g., [a-z] or \d).
75 split // Branch execution: target_x (primary), target_y (backtrack).
76 jmp // Unconditional jump to target_x.
77 save // Save current string position into a capture group index.
78 assert_start // Assert current position is start of string.
79 assert_end // Assert current position is end of string.
80 assert_line_start // Assert current position is start of a line (multiline mode).
81 assert_line_end // Assert current position is end of a line (multiline mode).
82 assert_bound // Assert word boundary (\b).
83 assert_nbound // Assert non-word boundary (\B).
84}
85
86// Inst represents a single bytecode instruction.
87// Packed for memory locality.
88struct Inst {
89mut:
90 typ InstType
91 val rune // Used by .char
92 val_str string // Used by .string
93 val_len int // Byte length of val_str
94 target_x int // Primary jump/split target
95 target_y int // Backtrack target for .split
96 group_idx int // Capture group index for .save
97 char_class []rune // Literal runes for character classes
98 bitmap [4]u32 // 128-bit bitset for ASCII character classes
99 inverted bool // Negation flag for classes
100 ignore_case bool // Case-insensitive flag
101 dot_all bool // Whether '.' matches '\n'
102}
103
104// Machine provides a workspace for VM execution.
105// To ensure thread safety, this is created per top-level API call.
106pub struct Machine {
107mut:
108 stack []int // Backtracking stack stores: [capture_states..., string_ptr, next_pc]
109 captures []int // Flat array of [start, end] byte indices for groups
110}
111
112// Regex is the compiled regular expression object.
113pub struct Regex {
114pub:
115 pattern string // The original regex string
116 prog []Inst // Compiled bytecode
117 total_groups int // Number of capture groups defined in pattern
118 group_map map[string]int // Mapping of names to indices for (?P<name>...)
119 prefix_lit string // Pre-calculated literal prefix for fast-skip optimization
120 has_prefix bool // Whether a literal prefix exists
121 anchored bool // True if pattern starts with '^' (optimization hint)
122pub mut:
123 max_stack_depth int // User-defined stack limit hint
124}
125
126// Match contains the results of a successful regex match.
127pub struct Match {
128pub:
129 text string // The full text of the match
130 start int // Starting byte index in the source string
131 end int // Ending byte index in the source string
132 groups []string // Sub-strings captured by groups
133}
134
135// Internal structures for Compilation
136struct Quantifier {
137mut:
138 min int
139 max int // -1 for infinity
140 greedy bool
141}
142
143struct Flags {
144mut:
145 ignore_case bool
146 multiline bool
147 dot_all bool
148}
149
150enum NodeType {
151 chr
152 any_char
153 group
154 alternation
155 char_class
156 start_of_string
157 end_of_string
158 word_boundary
159 non_word_boundary
160 word_char
161 non_word_char
162 digit
163 non_digit
164 whitespace
165 non_whitespace
166 lowercase_char
167 uppercase_char
168}
169
170struct Node {
171mut:
172 typ NodeType
173 chr rune
174 quant Quantifier
175 nodes []Node
176 alternatives [][]Node
177 group_capture_index int = -1
178 char_set []rune
179 inverted bool
180 ignore_case bool
181 multiline bool
182 dot_all bool
183}
184
185/******************************************************************************
186*
187* Internal Utilities (Inlined for Speed)
188*
189******************************************************************************/
190
191// read_rune_at decodes a UTF-8 rune from a byte pointer safely.
192// Marked inline to be embedded directly into the VM loop.
193@[inline]
194fn read_rune_at(str &u8, len int, index int) (rune, int) {
195 unsafe {
196 if index >= len {
197 return 0, 0
198 }
199 b0 := u32(str[index])
200 if b0 < 0x80 {
201 return rune(b0), 1
202 }
203 if (b0 & 0xE0) == 0xC0 && index + 1 < len {
204 return rune(((b0 & 0x1F) << 6) | (u32(str[index + 1]) & 0x3F)), 2
205 }
206 if (b0 & 0xF0) == 0xE0 && index + 2 < len {
207 return rune(((b0 & 0x0F) << 12) | ((u32(str[index + 1]) & 0x3F) << 6) | (u32(str[
208 index + 2]) & 0x3F)), 3
209 }
210 if (b0 & 0xF8) == 0xF0 && index + 3 < len {
211 return rune(((b0 & 0x07) << 18) | ((u32(str[index + 1]) & 0x3F) << 12) | ((u32(str[
212 index + 2]) & 0x3F) << 6) | (u32(str[index + 3]) & 0x3F)), 4
213 }
214 }
215 return 0, 0
216}
217
218// is_word_char returns true for alphanumeric ASCII characters and underscore.
219@[inline]
220fn is_word_char(r u8) bool {
221 return (r >= `a` && r <= `z`) || (r >= `A` && r <= `Z`) || (r >= `0` && r <= `9`) || r == `_`
222}
223
224// set_bitmap sets a specific bit in a 128-bit bitset for ASCII matching.
225@[inline]
226fn set_bitmap(mut bitmap [4]u32, r rune) {
227 if r >= 0 && r < 128 {
228 idx := u32(r) >> 5
229 bit := u32(r) & 31
230 bitmap[idx] |= (u32(1) << bit)
231 }
232}
233
234/******************************************************************************
235*
236* Compiler Logic
237*
238******************************************************************************/
239
240// compile transforms a regex pattern string into a Regex object.
241pub fn compile(pattern string) !Regex {
242 mut group_map := map[string]int{}
243 initial_flags := Flags{false, false, false}
244
245 // Phase 1: AST Parsing
246 nodes, _, final_group_count := parse_nodes(pattern, 0, `\0`, 0, initial_flags, mut group_map)!
247
248 root := Node{
249 typ: .group
250 nodes: nodes
251 quant: Quantifier{1, 1, true}
252 }
253
254 // Phase 2: Bytecode Emission
255 mut compiler := Compiler{
256 prog: []Inst{cap: 128}
257 }
258 compiler.emit_node(root)
259 compiler.emit(Inst{ typ: .match })
260
261 // Phase 3: Optimization
262 optimized_prog := compiler.optimize()
263
264 // Detect Prefix and Anchor optimizations
265 mut prefix := ''
266 mut has_prefix := false
267 mut anchored := false
268
269 if optimized_prog.len > 0 {
270 first := optimized_prog[0]
271 if first.typ == .string {
272 prefix = first.val_str
273 has_prefix = true
274 } else if first.typ == .char && !first.ignore_case && first.val < 128 {
275 prefix = unsafe { u8(first.val).ascii_str() }
276 has_prefix = true
277 } else if first.typ == .assert_start {
278 anchored = true
279 }
280 }
281
282 return Regex{
283 max_stack_depth: 2048
284 pattern: pattern
285 prog: optimized_prog
286 total_groups: final_group_count
287 group_map: group_map
288 prefix_lit: prefix
289 has_prefix: has_prefix
290 anchored: anchored
291 }
292}
293
294// new_machine allocates a new VM state machine.
295// This isolates the runtime memory (stack/captures) from the compiled regex, allowing thread-safe usage.
296pub fn (r &Regex) new_machine() Machine {
297 // Pre-allocate enough space for stack and captures to avoid re-allocation in hot path
298 return Machine{
299 stack: []int{len: r.max_stack_depth}
300 captures: []int{len: r.total_groups * 2}
301 }
302}
303
304// Compiler holds the state for generating the bytecode instructions.
305struct Compiler {
306mut:
307 prog []Inst
308}
309
310// emit appends an instruction to the program and returns its index.
311fn (mut c Compiler) emit(i Inst) int {
312 c.prog << i
313 return c.prog.len - 1
314}
315
316// optimize merges consecutive literal characters into single string instructions
317// and resolves jump targets to absolute indices.
318fn (mut c Compiler) optimize() []Inst {
319 mut targets := map[int]bool{}
320 for inst in c.prog {
321 if inst.typ == .split || inst.typ == .jmp {
322 targets[inst.target_x] = true
323 targets[inst.target_y] = true
324 }
325 }
326
327 mut new_prog := []Inst{cap: c.prog.len}
328 mut idx_map := map[int]int{}
329
330 mut i := 0
331 for i < c.prog.len {
332 inst := c.prog[i]
333 idx_map[i] = new_prog.len
334
335 // Optimization: Merge consecutive chars
336 if inst.typ == .char && !inst.ignore_case && inst.val < 128 {
337 mut s_val := unsafe { u8(inst.val).ascii_str() }
338 mut j := i + 1
339 for j < c.prog.len && !targets[j] {
340 next := c.prog[j]
341 if next.typ == .char && !next.ignore_case && next.val < 128 {
342 s_val += unsafe { u8(next.val).ascii_str() }
343 j++
344 } else {
345 break
346 }
347 }
348 if j > i + 1 {
349 new_prog << Inst{
350 typ: .string
351 val_str: s_val
352 val_len: s_val.len
353 }
354 for k := i + 1; k < j; k++ {
355 idx_map[k] = new_prog.len
356 }
357 i = j
358 continue
359 }
360 }
361 new_prog << inst
362 i++
363 }
364 idx_map[c.prog.len] = new_prog.len
365
366 // Fix jump offsets
367 for mut inst in new_prog {
368 if inst.typ == .split || inst.typ == .jmp {
369 inst.target_x = idx_map[inst.target_x] or { inst.target_x }
370 inst.target_y = idx_map[inst.target_y] or { inst.target_y }
371 }
372 }
373 return new_prog
374}
375
376// emit_class generates the instructions for a character class node.
377// It populates the bitmap for O(1) ASCII matching and the slice for Unicode.
378fn (mut c Compiler) emit_class(node Node) {
379 mut bitmap := [4]u32{}
380 mut char_class := node.char_set.clone()
381 mut inverted := node.inverted
382
383 // Pre-compile common classes into the bitmap for O(1) lookups
384 match node.typ {
385 .word_char {
386 for r := `0`; r <= `9`; r++ {
387 set_bitmap(mut bitmap, r)
388 }
389 for r := `a`; r <= `z`; r++ {
390 set_bitmap(mut bitmap, r)
391 }
392 for r := `A`; r <= `Z`; r++ {
393 set_bitmap(mut bitmap, r)
394 }
395 set_bitmap(mut bitmap, `_`)
396 }
397 .non_word_char {
398 inverted = true
399 for r := `0`; r <= `9`; r++ {
400 set_bitmap(mut bitmap, r)
401 }
402 for r := `a`; r <= `z`; r++ {
403 set_bitmap(mut bitmap, r)
404 }
405 for r := `A`; r <= `Z`; r++ {
406 set_bitmap(mut bitmap, r)
407 }
408 set_bitmap(mut bitmap, `_`)
409 }
410 .digit {
411 for r := `0`; r <= `9`; r++ {
412 set_bitmap(mut bitmap, r)
413 }
414 }
415 .non_digit {
416 inverted = true
417 for r := `0`; r <= `9`; r++ {
418 set_bitmap(mut bitmap, r)
419 }
420 }
421 .whitespace {
422 for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
423 set_bitmap(mut bitmap, r)
424 }
425 }
426 .non_whitespace {
427 inverted = true
428 for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] {
429 set_bitmap(mut bitmap, r)
430 }
431 }
432 .lowercase_char {
433 for r := `a`; r <= `z`; r++ {
434 set_bitmap(mut bitmap, r)
435 }
436 }
437 .uppercase_char {
438 for r := `A`; r <= `Z`; r++ {
439 set_bitmap(mut bitmap, r)
440 }
441 }
442 .char_class {
443 for r in node.char_set {
444 set_bitmap(mut bitmap, r)
445 if node.ignore_case {
446 if r >= `a` && r <= `z` {
447 set_bitmap(mut bitmap, r - 32)
448 } else if r >= `A` && r <= `Z` {
449 set_bitmap(mut bitmap, r + 32)
450 }
451 }
452 }
453 }
454 else {}
455 }
456
457 c.emit(Inst{
458 typ: .class
459 char_class: char_class
460 bitmap: bitmap
461 inverted: inverted
462 ignore_case: node.ignore_case
463 })
464}
465
466// emit_node handles quantifiers and loops, delegating the actual logic to emit_logic.
467fn (mut c Compiler) emit_node(node Node) {
468 for _ in 0 .. node.quant.min {
469 c.emit_logic(node)
470 }
471
472 if node.quant.max == -1 {
473 split_idx := c.emit(Inst{ typ: .split })
474 start_pc := c.prog.len
475 c.emit_logic(node)
476 c.emit(Inst{ typ: .jmp, target_x: split_idx })
477 if node.quant.greedy {
478 c.prog[split_idx].target_x = start_pc
479 c.prog[split_idx].target_y = c.prog.len
480 } else {
481 c.prog[split_idx].target_x = c.prog.len
482 c.prog[split_idx].target_y = start_pc
483 }
484 } else if node.quant.max > node.quant.min {
485 rem := node.quant.max - node.quant.min
486 mut splits := []int{}
487 for _ in 0 .. rem {
488 idx := c.emit(Inst{ typ: .split })
489 match_pc := c.prog.len
490 if node.quant.greedy {
491 c.prog[idx].target_x = match_pc
492 } else {
493 c.prog[idx].target_y = match_pc
494 }
495 c.emit_logic(node)
496 splits << idx
497 }
498 end_pc := c.prog.len
499 for idx in splits {
500 if node.quant.greedy {
501 c.prog[idx].target_y = end_pc
502 } else {
503 c.prog[idx].target_x = end_pc
504 }
505 }
506 }
507}
508
509// emit_logic generates instructions for specific node types (char, group, alternation).
510fn (mut c Compiler) emit_logic(node Node) {
511 match node.typ {
512 .chr {
513 c.emit(Inst{ typ: .char, val: node.chr, ignore_case: node.ignore_case })
514 }
515 .any_char {
516 c.emit(Inst{ typ: .any, dot_all: node.dot_all })
517 }
518 .group {
519 if node.group_capture_index != -1 {
520 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 })
521 }
522 for child in node.nodes {
523 c.emit_node(child)
524 }
525 if node.group_capture_index != -1 {
526 c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 + 1 })
527 }
528 }
529 .alternation {
530 if node.alternatives.len == 0 {
531 return
532 }
533 mut end_jumps := []int{}
534 for i := 0; i < node.alternatives.len - 1; i++ {
535 split_idx := c.emit(Inst{ typ: .split })
536 c.prog[split_idx].target_x = c.prog.len
537 for child in node.alternatives[i] {
538 c.emit_node(child)
539 }
540 end_jumps << c.emit(Inst{ typ: .jmp })
541 c.prog[split_idx].target_y = c.prog.len
542 }
543 for child in node.alternatives.last() {
544 c.emit_node(child)
545 }
546 for idx in end_jumps {
547 c.prog[idx].target_x = c.prog.len
548 }
549 }
550 .start_of_string {
551 c.emit(Inst{
552 typ: if node.multiline { InstType.assert_line_start } else { InstType.assert_start }
553 })
554 }
555 .end_of_string {
556 c.emit(Inst{
557 typ: if node.multiline { InstType.assert_line_end } else { InstType.assert_end }
558 })
559 }
560 .word_boundary {
561 c.emit(Inst{ typ: .assert_bound })
562 }
563 .non_word_boundary {
564 c.emit(Inst{ typ: .assert_nbound })
565 }
566 else {
567 c.emit_class(node)
568 }
569 }
570}
571
572// parse_nodes implements a recursive descent parser to construct the AST from the pattern string.
573fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_start int, passed_flags Flags, mut group_map map[string]int) !([]Node, int, int) {
574 mut pos := pos_start
575 mut group_counter := group_counter_start
576 mut current_flags := passed_flags
577 mut alternatives := [][]Node{}
578 mut current_sequence := []Node{}
579
580 for pos < pattern.len {
581 chr, char_len := read_rune_at(pattern.str, pattern.len, pos)
582 if chr == terminator {
583 pos += char_len
584 if alternatives.len > 0 {
585 alternatives << current_sequence
586 return [
587 Node{
588 typ: .alternation
589 alternatives: alternatives
590 quant: Quantifier{1, 1, true}
591 },
592 ], pos, group_counter
593 }
594 return current_sequence, pos, group_counter
595 }
596 if chr == `|` {
597 pos += char_len
598 if current_sequence.len == 0 {
599 return error('Empty alternative')
600 }
601 alternatives << current_sequence
602 current_sequence = []Node{}
603 continue
604 }
605
606 mut parsed_nodes := []Node{}
607 match chr {
608 `^` {
609 pos += char_len
610 parsed_nodes << Node{
611 typ: .start_of_string
612 multiline: current_flags.multiline
613 }
614 }
615 `$` {
616 pos += char_len
617 parsed_nodes << Node{
618 typ: .end_of_string
619 multiline: current_flags.multiline
620 }
621 }
622 `.` {
623 pos += char_len
624 parsed_nodes << Node{
625 typ: .any_char
626 dot_all: current_flags.dot_all
627 }
628 }
629 `*`, `+`, `?`, `{` {
630 return error('Quantifier must follow a token')
631 }
632 `(` {
633 pos += char_len
634 mut idx := -1
635 mut cap := true
636 if pos < pattern.len && pattern[pos] == `?` {
637 pos++
638 if pos < pattern.len && pattern[pos] in [`i`, `m`, `s`] {
639 for pos < pattern.len && pattern[pos] != `)` {
640 match pattern[pos] {
641 `i` { current_flags.ignore_case = true }
642 `m` { current_flags.multiline = true }
643 `s` { current_flags.dot_all = true }
644 else {}
645 }
646
647 pos++
648 }
649 if pos < pattern.len {
650 pos++
651 }
652 continue
653 } else if pos < pattern.len && pattern[pos] == `:` {
654 cap = false
655 pos++
656 } else if pos < pattern.len && pattern[pos] == `P` {
657 pos++
658 if pos >= pattern.len || pattern[pos] != `<` {
659 return error('Invalid named group syntax')
660 }
661 pos++
662 end := pattern.index_after('>', pos) or { -1 }
663 if end == -1 {
664 return error('Unclosed named group')
665 }
666 name := pattern[pos..end]
667 idx = group_counter
668 group_map[name] = idx
669 pos = end + 1
670 }
671 }
672 if cap {
673 if idx == -1 {
674 idx = group_counter
675 }
676 group_counter++
677 }
678 sub, new_p, new_c := parse_nodes(pattern, pos, `)`, group_counter, current_flags, mut
679 group_map)!
680 pos = new_p
681 group_counter = new_c
682 parsed_nodes << Node{
683 typ: .group
684 nodes: sub
685 group_capture_index: idx
686 }
687 }
688 `[` {
689 pos += char_len
690 end := pattern.index_after(']', pos) or { -1 }
691 if end == -1 {
692 return error('Unclosed character class')
693 }
694 content := pattern[pos..end]
695 pos = end + 1
696 inv := content.len > 0 && content[0] == `^`
697 mut set := []rune{}
698 mut i := if inv { 1 } else { 0 }
699 for i < content.len {
700 c, l := read_rune_at(content.str, content.len, i)
701 if i + l + 1 < content.len && content[i + l] == `-` {
702 ec, el := read_rune_at(content.str, content.len, i + l + 1)
703 for r := c; r <= ec; r++ {
704 set << r
705 }
706 i += l + 1 + el
707 } else {
708 set << c
709 i += l
710 }
711 }
712 parsed_nodes << Node{
713 typ: .char_class
714 char_set: set
715 inverted: inv
716 ignore_case: current_flags.ignore_case
717 }
718 }
719 `\\` {
720 pos += char_len
721 if pos >= pattern.len {
722 return error('Trailing backslash')
723 }
724 esc, el := read_rune_at(pattern.str, pattern.len, pos)
725 pos += el
726 match esc {
727 `w` {
728 parsed_nodes << Node{
729 typ: .word_char
730 }
731 }
732 `W` {
733 parsed_nodes << Node{
734 typ: .non_word_char
735 }
736 }
737 `d` {
738 parsed_nodes << Node{
739 typ: .digit
740 }
741 }
742 `D` {
743 parsed_nodes << Node{
744 typ: .non_digit
745 }
746 }
747 `s` {
748 parsed_nodes << Node{
749 typ: .whitespace
750 }
751 }
752 `S` {
753 parsed_nodes << Node{
754 typ: .non_whitespace
755 }
756 }
757 `b` {
758 parsed_nodes << Node{
759 typ: .word_boundary
760 }
761 }
762 `B` {
763 parsed_nodes << Node{
764 typ: .non_word_boundary
765 }
766 }
767 `a` {
768 parsed_nodes << Node{
769 typ: .lowercase_char
770 }
771 }
772 `A` {
773 parsed_nodes << Node{
774 typ: .uppercase_char
775 }
776 }
777 else {
778 parsed_nodes << Node{
779 typ: .chr
780 chr: esc
781 ignore_case: current_flags.ignore_case
782 }
783 }
784 }
785 }
786 else {
787 pos += char_len
788 parsed_nodes << Node{
789 typ: .chr
790 chr: chr
791 ignore_case: current_flags.ignore_case
792 }
793 }
794 }
795
796 if parsed_nodes.len > 0 {
797 mut q := Quantifier{1, 1, true}
798 if pos < pattern.len {
799 peek := pattern[pos]
800 match peek {
801 `*` {
802 q = Quantifier{0, -1, true}
803 pos++
804 }
805 `+` {
806 q = Quantifier{1, -1, true}
807 pos++
808 }
809 `?` {
810 q = Quantifier{0, 1, true}
811 pos++
812 }
813 `{` {
814 end := pattern.index_after('}', pos) or { -1 }
815 if end == -1 {
816 return error('Unclosed quantifier')
817 }
818 parts := pattern[pos + 1..end].split(',')
819 min := parts[0].int()
820 max := if parts.len > 1 {
821 if parts[1] == '' { -1 } else { parts[1].int() }
822 } else {
823 min
824 }
825 q = Quantifier{min, max, true}
826 pos = end + 1
827 }
828 else {}
829 }
830
831 if pos < pattern.len && pattern[pos] == `?` {
832 q.greedy = false
833 pos++
834 }
835 }
836 parsed_nodes.last().quant = q
837 }
838 current_sequence << parsed_nodes
839 }
840
841 if alternatives.len > 0 {
842 if current_sequence.len == 0 {
843 return error('Empty alternative at end of pattern')
844 }
845 alternatives << current_sequence
846 return [
847 Node{
848 typ: .alternation
849 alternatives: alternatives
850 quant: Quantifier{1, 1, true}
851 },
852 ], pos, group_counter
853 }
854 return current_sequence, pos, group_counter
855}
856
857/******************************************************************************
858*
859* Virtual Machine Execution Engine (Highly Optimized)
860*
861******************************************************************************/
862
863// vm_match executes the bytecode against the input string using the provided Machine state.
864// OPTIMIZATION: Uses raw pointers for instruction and stack access to bypass bounds checking.
865@[direct_array_access]
866fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match {
867 unsafe {
868 // Optimization: Cast voidptr to typed pointer for direct indexing
869 mut cap_ptr := &int(m.captures.data)
870 cap_len := m.captures.len
871
872 // Fast clear of captures using pointer arithmetic (memset-like)
873 for i := 0; i < cap_len; i++ {
874 cap_ptr[i] = -1
875 }
876
877 mut sp := start_pos // String Pointer Index
878 mut stack_ptr := 0 // Stack Pointer Offset
879
880 cap_size := r.total_groups * 2
881 frame_size := cap_size + 2 // [captures..., saved_sp, saved_pc]
882
883 // Raw pointers for hot path access
884 prog_start := &Inst(r.prog.data)
885 mut inst_ptr := prog_start // PC as a pointer
886
887 str_ptr := text.str
888 str_len := text.len
889
890 // Cache stack data pointer (cast to typed pointer)
891 mut stack_data := &int(m.stack.data)
892 mut stack_max := m.stack.len
893
894 for {
895 // Check if we walked off the program (should be caught by match inst)
896 // Using pointer arithmetic: offset = (inst_ptr - prog_start)
897
898 match inst_ptr.typ {
899 .match {
900 // Only allocate result strings on successful match
901 mut s_groups := []string{cap: r.total_groups}
902 for i := 0; i < r.total_groups; i++ {
903 s, e := cap_ptr[i * 2], cap_ptr[i * 2 + 1]
904 s_groups << if s != -1 && e >= s { text[s..e] } else { '' }
905 }
906 return Match{
907 text: text[start_pos..sp]
908 start: start_pos
909 end: sp
910 groups: s_groups
911 }
912 }
913 .char {
914 if sp >= str_len {
915 goto backtrack
916 }
917 curr_byte := str_ptr[sp]
918
919 // Fast ASCII path
920 if curr_byte < 128 && inst_ptr.val < 128 {
921 mut c1, mut c2 := curr_byte, u8(inst_ptr.val)
922 if inst_ptr.ignore_case {
923 if c1 >= `a` && c1 <= `z` {
924 c1 -= 32
925 }
926 if c2 >= `a` && c2 <= `z` {
927 c2 -= 32
928 }
929 }
930 if c1 == c2 {
931 sp++
932 inst_ptr++
933 continue
934 }
935 goto backtrack
936 }
937
938 // UTF-8 Path
939 rn, l := read_rune_at(str_ptr, str_len, sp)
940 if l == 0 {
941 goto backtrack
942 }
943
944 mut match_ok := false
945 if inst_ptr.ignore_case {
946 r1 := if rn >= `a` && rn <= `z` { rn - 32 } else { rn }
947 r2 := if inst_ptr.val >= `a` && inst_ptr.val <= `z` {
948 inst_ptr.val - 32
949 } else {
950 inst_ptr.val
951 }
952 if r1 == r2 {
953 match_ok = true
954 }
955 } else if rn == inst_ptr.val {
956 match_ok = true
957 }
958
959 if match_ok {
960 sp += l
961 inst_ptr++
962 } else {
963 goto backtrack
964 }
965 }
966 .string {
967 if sp + inst_ptr.val_len > str_len {
968 goto backtrack
969 }
970 // Inline memcmp
971 v_str := inst_ptr.val_str.str
972 for i in 0 .. inst_ptr.val_len {
973 if str_ptr[sp + i] != v_str[i] {
974 goto backtrack
975 }
976 }
977 sp += inst_ptr.val_len
978 inst_ptr++
979 }
980 .class {
981 if sp >= str_len {
982 goto backtrack
983 }
984 c_byte := str_ptr[sp]
985 mut matched := false
986 mut cl := 1
987
988 // Optimization: Bitmap lookup for ASCII
989 if c_byte < 128 {
990 if (inst_ptr.bitmap[c_byte >> 5] & (u32(1) << (c_byte & 31))) != 0 {
991 matched = true
992 }
993 } else {
994 rn, l := read_rune_at(str_ptr, str_len, sp)
995 cl = l
996 for cr in inst_ptr.char_class {
997 if cr == rn {
998 matched = true
999 break
1000 }
1001 }
1002 }
1003
1004 if matched != inst_ptr.inverted {
1005 sp += cl
1006 inst_ptr++
1007 } else {
1008 goto backtrack
1009 }
1010 }
1011 .any {
1012 if sp >= str_len {
1013 goto backtrack
1014 }
1015 if inst_ptr.dot_all || str_ptr[sp] != `\n` {
1016 _, cl := read_rune_at(str_ptr, str_len, sp)
1017 sp += cl
1018 inst_ptr++
1019 } else {
1020 goto backtrack
1021 }
1022 }
1023 .save {
1024 cap_ptr[inst_ptr.group_idx] = sp
1025 inst_ptr++
1026 }
1027 .split {
1028 if stack_ptr + frame_size >= stack_max {
1029 new_size := stack_max * 2
1030 if new_size > 1_000_000 {
1031 goto backtrack
1032 }
1033 m.stack.grow_len(new_size)
1034 stack_data = &int(m.stack.data) // Pointer might change on realloc
1035 stack_max = new_size
1036 }
1037
1038 // Optimization: Unrolled stack push
1039 stack_offset := stack_ptr
1040 for i in 0 .. cap_size {
1041 stack_data[stack_offset + i] = cap_ptr[i]
1042 }
1043 stack_data[stack_offset + cap_size] = sp
1044 // Save backtrack target PC index
1045 stack_data[stack_offset + cap_size + 1] = inst_ptr.target_y
1046
1047 stack_ptr += frame_size
1048 // Jump to primary target (convert index to pointer)
1049 // FIX: Use pointer indexing instead of addition to avoid type mismatch
1050 inst_ptr = &prog_start[inst_ptr.target_x]
1051 }
1052 .jmp {
1053 inst_ptr = &prog_start[inst_ptr.target_x]
1054 }
1055 .assert_start {
1056 if sp == 0 {
1057 inst_ptr++
1058 } else {
1059 goto backtrack
1060 }
1061 }
1062 .assert_end {
1063 if sp == str_len {
1064 inst_ptr++
1065 } else {
1066 goto backtrack
1067 }
1068 }
1069 .assert_line_start {
1070 if sp == 0 || (sp > 0 && str_ptr[sp - 1] == `\n`) {
1071 inst_ptr++
1072 } else {
1073 goto backtrack
1074 }
1075 }
1076 .assert_line_end {
1077 if sp == str_len || str_ptr[sp] == `\n` {
1078 inst_ptr++
1079 } else {
1080 goto backtrack
1081 }
1082 }
1083 .assert_bound, .assert_nbound {
1084 l := if sp > 0 { is_word_char(str_ptr[sp - 1]) } else { false }
1085 r_ := if sp < str_len { is_word_char(str_ptr[sp]) } else { false }
1086 if (inst_ptr.typ == .assert_bound && l != r_)
1087 || (inst_ptr.typ == .assert_nbound && l == r_) {
1088 inst_ptr++
1089 } else {
1090 goto backtrack
1091 }
1092 }
1093 }
1094
1095 continue
1096
1097 backtrack:
1098 if stack_ptr <= 0 {
1099 return none
1100 }
1101
1102 stack_ptr -= frame_size
1103 stack_offset := stack_ptr
1104
1105 // Restore captures
1106 for i in 0 .. cap_size {
1107 cap_ptr[i] = stack_data[stack_offset + i]
1108 }
1109 sp = stack_data[stack_offset + cap_size]
1110 // Restore PC from index (explicit cast for pointer arithmetic)
1111 // FIX: Use pointer indexing instead of addition
1112 inst_ptr = &prog_start[stack_data[stack_offset + cap_size + 1]]
1113 }
1114 }
1115 return none
1116}
1117
1118/******************************************************************************
1119*
1120* Public API
1121*
1122******************************************************************************/
1123
1124// find returns the first match of the regex in text.
1125pub fn (r &Regex) find(text string) ?Match {
1126 return r.find_from(text, 0)
1127}
1128
1129// find_from returns the first match starting from start_index.
1130// Optimized with fast prefix skipping and anchor checks.
1131pub fn (r &Regex) find_from(text string, start_index int) ?Match {
1132 if start_index < 0 || start_index > text.len {
1133 return none
1134 }
1135 mut m := r.new_machine()
1136
1137 // Optimization: Anchored pattern (^) only checks the start
1138 if r.anchored {
1139 if start_index == 0 {
1140 return r.vm_match(text, 0, mut m)
1141 }
1142 // If multiline mode is NOT enabled, ^ only matches index 0.
1143 // If multiline is enabled, we need to check every newline, handled by logic below.
1144 // Note: The compiler sets anchored=true only for ^ at start.
1145 // If multiline flag is set dynamically inside pattern, strict anchoring logic might differ,
1146 // but standard ^ usage benefits here.
1147 }
1148
1149 // Optimization: Boyer-Moore-like literal prefix skip
1150 if r.has_prefix {
1151 mut i := text.index_after(r.prefix_lit, start_index) or { -1 }
1152 for i != -1 {
1153 if res := r.vm_match(text, i, mut m) {
1154 return res
1155 }
1156 i = text.index_after(r.prefix_lit, i + 1) or { -1 }
1157 }
1158 return none
1159 }
1160
1161 for i := start_index; i <= text.len; i++ {
1162 // Skip UTF-8 continuation bytes to ensure we only match at rune boundaries
1163 // 0xC0 (11000000) is the start of a multi-byte sequence, 0x80 (10000000) is a continuation
1164 if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1165 continue
1166 }
1167 if res := r.vm_match(text, i, mut m) {
1168 return res
1169 }
1170 }
1171 return none
1172}
1173
1174// find_all returns all non-overlapping matches in text.
1175pub fn (r &Regex) find_all(text string) []Match {
1176 mut matches := []Match{}
1177 mut m := r.new_machine()
1178 mut i := 0
1179 for i <= text.len {
1180 if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 {
1181 i++
1182 continue
1183 }
1184 if res := r.vm_match(text, i, mut m) {
1185 matches << res
1186 i = if res.end > i { res.end } else { i + 1 }
1187 } else {
1188 i++
1189 }
1190 }
1191 return matches
1192}
1193
1194// replace finds the first match and replaces it using repl. Supports $1, $2 backreferences.
1195pub fn (r &Regex) replace(text string, repl string) string {
1196 res := r.find(text) or { return text }
1197 mut sb := strings.new_builder(text.len + repl.len)
1198 sb.write_string(text[0..res.start])
1199 mut i := 0
1200 for i < repl.len {
1201 if repl[i] == `$` && i + 1 < repl.len {
1202 next_char := repl[i + 1]
1203 if next_char >= `0` && next_char <= `9` {
1204 d := next_char - `0` - 1
1205 if d >= 0 && d < res.groups.len {
1206 sb.write_string(res.groups[d])
1207 }
1208 i += 2
1209 continue
1210 }
1211 }
1212 sb.write_u8(repl[i])
1213 i++
1214 }
1215 sb.write_string(text[res.end..])
1216 return sb.str()
1217}
1218
1219// fullmatch returns a Match only if the pattern matches the entire text.
1220pub fn (r &Regex) fullmatch(text string) ?Match {
1221 mut m := r.new_machine()
1222 if res := r.vm_match(text, 0, mut m) {
1223 if res.end == text.len {
1224 return res
1225 }
1226 }
1227 return none
1228}
1229
1230// group_by_name retrieves text captured by a named group.
1231pub fn (r &Regex) group_by_name(m Match, name string) string {
1232 idx := r.group_map[name] or { return '' }
1233 return if idx < m.groups.len { m.groups[idx] } else { '' }
1234}
1235
1236/******************************************************************************
1237*
1238* Compatibility Layer
1239*
1240******************************************************************************/
1241
1242// new_regex is a helper wrapper to compile a regex pattern.
1243pub fn new_regex(pattern string, _ int) !Regex {
1244 return compile(pattern)
1245}
1246
1247// match_str is a compatibility alias for find_from.
1248pub fn (r &Regex) match_str(text string, start_index int, _ int) ?Match {
1249 return r.find_from(text, start_index)
1250}
1251
1252// get returns the matched text for a specific group index.
1253// Index 0 returns the full match, 1..n returns capture groups.
1254pub fn (m Match) get(idx int) ?string {
1255 if idx == 0 {
1256 return m.text
1257 }
1258 if idx > 0 && idx <= m.groups.len {
1259 return m.groups[idx - 1]
1260 }
1261 return none
1262}
1263
1264// get_all returns a list of all captured strings, starting with the full match at index 0.
1265pub fn (m Match) get_all() []string {
1266 mut res := [m.text]
1267 res << m.groups
1268 return res
1269}
1270