| 1 | /* |
| 2 | regex2 0.9.6 beta (VM Edition) - Performance Optimized |
| 3 | |
| 4 | Copyright (c) 2026 Dario Deledda. All rights reserved. |
| 5 | Use of this source code is governed by an MIT license |
| 6 | that can be found in the LICENSE file. |
| 7 | |
| 8 | This file contains a regex module based on a Virtual Machine approach. |
| 9 | |
| 10 | Features: |
| 11 | - **Non-recursive VM**: Safe execution without stack overflow on complex patterns. |
| 12 | - UTF8 support with Fast ASCII path. |
| 13 | - Literal characters, '.', '*', '{m,n}'. |
| 14 | - Short quantifiers: '?', '+'. |
| 15 | - Non-greedy quantifiers: '*?', '+?', '??'. |
| 16 | - Nested groups: '()'. |
| 17 | - Named groups: '(?P<name>...)'. |
| 18 | - Non-capturing groups: '(?:...)'. |
| 19 | - Character classes: \w, \W, \d, \D, \s, \S, \a, \A. |
| 20 | - Hex escapes: \xHH, \XHHHH. |
| 21 | - Alternation: '|'. |
| 22 | - Character classes: [abc], [^abc], [a-z]. |
| 23 | - Anchors: '^' (start of string), '$' (end of string). |
| 24 | - Word boundaries: '\b' (word boundary), '\B' (non-word boundary). |
| 25 | - Inline Flags: |
| 26 | - '(?i)' Case insensitive. |
| 27 | - '(?m)' Multiline (anchors match newlines). |
| 28 | - '(?s)' Dot-all (dot matches newline). |
| 29 | |
| 30 | Configuration: |
| 31 | - `max_stack_depth`: Controls the dynamic growth limit of the backtracking stack. |
| 32 | Default is 2048. Increase this value if you encounter complex patterns failing |
| 33 | on deep recursions/backtracking, or decrease it to limit memory usage. |
| 34 | |
| 35 | Functions: |
| 36 | - `compile(pattern) !Regex` -> Compiles a pattern into a Regex object. |
| 37 | - `(r Regex) find(text) ?Match` -> Finds the first match in a string. |
| 38 | - `(r Regex) find_all(text) []Match` -> Finds all non-overlapping matches. |
| 39 | - `(r Regex) find_from(text, start_index) ?Match` -> Finds the first match starting from a specific index. |
| 40 | - `(r Regex) replace(text, repl) string` -> Finds and replaces the first match. |
| 41 | - `(r Regex) fullmatch(text string) ?Match` -> Match exact strings. |
| 42 | - `(r Regex) group_by_name(m Match, name string) string` -> Get text of a named group. |
| 43 | |
| 44 | Key Architectural Features and Optimizations: |
| 45 | - **Pre-allocated Machine workspace**: All search operations are zero-allocation. |
| 46 | - **Raw pointer access**: Bypasses bounds checking for string and instruction access. |
| 47 | - **Fast ASCII Path**: Characters < 128 bypass heavy UTF-8 decoding. |
| 48 | - **Instruction merging**: Consecutive character matches are merged into string blocks. |
| 49 | - **Bitmap lookups**: ASCII character classes use a 128-bit bitset for O(1) matching. |
| 50 | - **NFA Virtual Machine**: Executes bytecode instructions to simulate pattern matching. |
| 51 | - **Dynamic Stack Growth**: Automatically expands the backtracking stack to prevent false negatives. |
| 52 | - **Zero-Allocation Search**: Reuses a pre-allocated Machine workspace for search operations. |
| 53 | - **Anchored Optimization**: Patterns starting with '^' skip the scanning loop. |
| 54 | - **Prefix Skipping**: Uses Boyer-Moore-like skipping for literal prefixes. |
| 55 | |
| 56 | */ |
| 57 | |
| 58 | module pcre |
| 59 | |
| 60 | import strings |
| 61 | |
| 62 | /****************************************************************************** |
| 63 | * |
| 64 | * VM Instruction Definition |
| 65 | * |
| 66 | ******************************************************************************/ |
| 67 | |
| 68 | // InstType defines the operation codes for the Virtual Machine. |
| 69 | enum InstType as u8 { |
| 70 | match // Halt and signal a successful match. |
| 71 | char // Match a single UTF-8 rune. |
| 72 | string // Match a sequence of ASCII characters (merged optimization). |
| 73 | any // Match any character (dot). |
| 74 | class // Match a character class (e.g., [a-z] or \d). |
| 75 | split // Branch execution: target_x (primary), target_y (backtrack). |
| 76 | jmp // Unconditional jump to target_x. |
| 77 | save // Save current string position into a capture group index. |
| 78 | assert_start // Assert current position is start of string. |
| 79 | assert_end // Assert current position is end of string. |
| 80 | assert_line_start // Assert current position is start of a line (multiline mode). |
| 81 | assert_line_end // Assert current position is end of a line (multiline mode). |
| 82 | assert_bound // Assert word boundary (\b). |
| 83 | assert_nbound // Assert non-word boundary (\B). |
| 84 | } |
| 85 | |
| 86 | // Inst represents a single bytecode instruction. |
| 87 | // Packed for memory locality. |
| 88 | struct Inst { |
| 89 | mut: |
| 90 | typ InstType |
| 91 | val rune // Used by .char |
| 92 | val_str string // Used by .string |
| 93 | val_len int // Byte length of val_str |
| 94 | target_x int // Primary jump/split target |
| 95 | target_y int // Backtrack target for .split |
| 96 | group_idx int // Capture group index for .save |
| 97 | char_class []rune // Literal runes for character classes |
| 98 | bitmap [4]u32 // 128-bit bitset for ASCII character classes |
| 99 | inverted bool // Negation flag for classes |
| 100 | ignore_case bool // Case-insensitive flag |
| 101 | dot_all bool // Whether '.' matches '\n' |
| 102 | } |
| 103 | |
| 104 | // Machine provides a workspace for VM execution. |
| 105 | // To ensure thread safety, this is created per top-level API call. |
| 106 | pub struct Machine { |
| 107 | mut: |
| 108 | stack []int // Backtracking stack stores: [capture_states..., string_ptr, next_pc] |
| 109 | captures []int // Flat array of [start, end] byte indices for groups |
| 110 | } |
| 111 | |
| 112 | // Regex is the compiled regular expression object. |
| 113 | pub struct Regex { |
| 114 | pub: |
| 115 | pattern string // The original regex string |
| 116 | prog []Inst // Compiled bytecode |
| 117 | total_groups int // Number of capture groups defined in pattern |
| 118 | group_map map[string]int // Mapping of names to indices for (?P<name>...) |
| 119 | prefix_lit string // Pre-calculated literal prefix for fast-skip optimization |
| 120 | has_prefix bool // Whether a literal prefix exists |
| 121 | anchored bool // True if pattern starts with '^' (optimization hint) |
| 122 | pub mut: |
| 123 | max_stack_depth int // User-defined stack limit hint |
| 124 | } |
| 125 | |
| 126 | // Match contains the results of a successful regex match. |
| 127 | pub struct Match { |
| 128 | pub: |
| 129 | text string // The full text of the match |
| 130 | start int // Starting byte index in the source string |
| 131 | end int // Ending byte index in the source string |
| 132 | groups []string // Sub-strings captured by groups |
| 133 | } |
| 134 | |
| 135 | // Internal structures for Compilation |
| 136 | struct Quantifier { |
| 137 | mut: |
| 138 | min int |
| 139 | max int // -1 for infinity |
| 140 | greedy bool |
| 141 | } |
| 142 | |
| 143 | struct Flags { |
| 144 | mut: |
| 145 | ignore_case bool |
| 146 | multiline bool |
| 147 | dot_all bool |
| 148 | } |
| 149 | |
| 150 | enum NodeType { |
| 151 | chr |
| 152 | any_char |
| 153 | group |
| 154 | alternation |
| 155 | char_class |
| 156 | start_of_string |
| 157 | end_of_string |
| 158 | word_boundary |
| 159 | non_word_boundary |
| 160 | word_char |
| 161 | non_word_char |
| 162 | digit |
| 163 | non_digit |
| 164 | whitespace |
| 165 | non_whitespace |
| 166 | lowercase_char |
| 167 | uppercase_char |
| 168 | } |
| 169 | |
| 170 | struct Node { |
| 171 | mut: |
| 172 | typ NodeType |
| 173 | chr rune |
| 174 | quant Quantifier |
| 175 | nodes []Node |
| 176 | alternatives [][]Node |
| 177 | group_capture_index int = -1 |
| 178 | char_set []rune |
| 179 | inverted bool |
| 180 | ignore_case bool |
| 181 | multiline bool |
| 182 | dot_all bool |
| 183 | } |
| 184 | |
| 185 | /****************************************************************************** |
| 186 | * |
| 187 | * Internal Utilities (Inlined for Speed) |
| 188 | * |
| 189 | ******************************************************************************/ |
| 190 | |
| 191 | // read_rune_at decodes a UTF-8 rune from a byte pointer safely. |
| 192 | // Marked inline to be embedded directly into the VM loop. |
| 193 | @[inline] |
| 194 | fn read_rune_at(str &u8, len int, index int) (rune, int) { |
| 195 | unsafe { |
| 196 | if index >= len { |
| 197 | return 0, 0 |
| 198 | } |
| 199 | b0 := u32(str[index]) |
| 200 | if b0 < 0x80 { |
| 201 | return rune(b0), 1 |
| 202 | } |
| 203 | if (b0 & 0xE0) == 0xC0 && index + 1 < len { |
| 204 | return rune(((b0 & 0x1F) << 6) | (u32(str[index + 1]) & 0x3F)), 2 |
| 205 | } |
| 206 | if (b0 & 0xF0) == 0xE0 && index + 2 < len { |
| 207 | return rune(((b0 & 0x0F) << 12) | ((u32(str[index + 1]) & 0x3F) << 6) | (u32(str[ |
| 208 | index + 2]) & 0x3F)), 3 |
| 209 | } |
| 210 | if (b0 & 0xF8) == 0xF0 && index + 3 < len { |
| 211 | return rune(((b0 & 0x07) << 18) | ((u32(str[index + 1]) & 0x3F) << 12) | ((u32(str[ |
| 212 | index + 2]) & 0x3F) << 6) | (u32(str[index + 3]) & 0x3F)), 4 |
| 213 | } |
| 214 | } |
| 215 | return 0, 0 |
| 216 | } |
| 217 | |
| 218 | // is_word_char returns true for alphanumeric ASCII characters and underscore. |
| 219 | @[inline] |
| 220 | fn is_word_char(r u8) bool { |
| 221 | return (r >= `a` && r <= `z`) || (r >= `A` && r <= `Z`) || (r >= `0` && r <= `9`) || r == `_` |
| 222 | } |
| 223 | |
| 224 | // set_bitmap sets a specific bit in a 128-bit bitset for ASCII matching. |
| 225 | @[inline] |
| 226 | fn set_bitmap(mut bitmap [4]u32, r rune) { |
| 227 | if r >= 0 && r < 128 { |
| 228 | idx := u32(r) >> 5 |
| 229 | bit := u32(r) & 31 |
| 230 | bitmap[idx] |= (u32(1) << bit) |
| 231 | } |
| 232 | } |
| 233 | |
| 234 | /****************************************************************************** |
| 235 | * |
| 236 | * Compiler Logic |
| 237 | * |
| 238 | ******************************************************************************/ |
| 239 | |
| 240 | // compile transforms a regex pattern string into a Regex object. |
| 241 | pub fn compile(pattern string) !Regex { |
| 242 | mut group_map := map[string]int{} |
| 243 | initial_flags := Flags{false, false, false} |
| 244 | |
| 245 | // Phase 1: AST Parsing |
| 246 | nodes, _, final_group_count := parse_nodes(pattern, 0, `\0`, 0, initial_flags, mut group_map)! |
| 247 | |
| 248 | root := Node{ |
| 249 | typ: .group |
| 250 | nodes: nodes |
| 251 | quant: Quantifier{1, 1, true} |
| 252 | } |
| 253 | |
| 254 | // Phase 2: Bytecode Emission |
| 255 | mut compiler := Compiler{ |
| 256 | prog: []Inst{cap: 128} |
| 257 | } |
| 258 | compiler.emit_node(root) |
| 259 | compiler.emit(Inst{ typ: .match }) |
| 260 | |
| 261 | // Phase 3: Optimization |
| 262 | optimized_prog := compiler.optimize() |
| 263 | |
| 264 | // Detect Prefix and Anchor optimizations |
| 265 | mut prefix := '' |
| 266 | mut has_prefix := false |
| 267 | mut anchored := false |
| 268 | |
| 269 | if optimized_prog.len > 0 { |
| 270 | first := optimized_prog[0] |
| 271 | if first.typ == .string { |
| 272 | prefix = first.val_str |
| 273 | has_prefix = true |
| 274 | } else if first.typ == .char && !first.ignore_case && first.val < 128 { |
| 275 | prefix = unsafe { u8(first.val).ascii_str() } |
| 276 | has_prefix = true |
| 277 | } else if first.typ == .assert_start { |
| 278 | anchored = true |
| 279 | } |
| 280 | } |
| 281 | |
| 282 | return Regex{ |
| 283 | max_stack_depth: 2048 |
| 284 | pattern: pattern |
| 285 | prog: optimized_prog |
| 286 | total_groups: final_group_count |
| 287 | group_map: group_map |
| 288 | prefix_lit: prefix |
| 289 | has_prefix: has_prefix |
| 290 | anchored: anchored |
| 291 | } |
| 292 | } |
| 293 | |
| 294 | // new_machine allocates a new VM state machine. |
| 295 | // This isolates the runtime memory (stack/captures) from the compiled regex, allowing thread-safe usage. |
| 296 | pub fn (r &Regex) new_machine() Machine { |
| 297 | // Pre-allocate enough space for stack and captures to avoid re-allocation in hot path |
| 298 | return Machine{ |
| 299 | stack: []int{len: r.max_stack_depth} |
| 300 | captures: []int{len: r.total_groups * 2} |
| 301 | } |
| 302 | } |
| 303 | |
| 304 | // Compiler holds the state for generating the bytecode instructions. |
| 305 | struct Compiler { |
| 306 | mut: |
| 307 | prog []Inst |
| 308 | } |
| 309 | |
| 310 | // emit appends an instruction to the program and returns its index. |
| 311 | fn (mut c Compiler) emit(i Inst) int { |
| 312 | c.prog << i |
| 313 | return c.prog.len - 1 |
| 314 | } |
| 315 | |
| 316 | // optimize merges consecutive literal characters into single string instructions |
| 317 | // and resolves jump targets to absolute indices. |
| 318 | fn (mut c Compiler) optimize() []Inst { |
| 319 | mut targets := map[int]bool{} |
| 320 | for inst in c.prog { |
| 321 | if inst.typ == .split || inst.typ == .jmp { |
| 322 | targets[inst.target_x] = true |
| 323 | targets[inst.target_y] = true |
| 324 | } |
| 325 | } |
| 326 | |
| 327 | mut new_prog := []Inst{cap: c.prog.len} |
| 328 | mut idx_map := map[int]int{} |
| 329 | |
| 330 | mut i := 0 |
| 331 | for i < c.prog.len { |
| 332 | inst := c.prog[i] |
| 333 | idx_map[i] = new_prog.len |
| 334 | |
| 335 | // Optimization: Merge consecutive chars |
| 336 | if inst.typ == .char && !inst.ignore_case && inst.val < 128 { |
| 337 | mut s_val := unsafe { u8(inst.val).ascii_str() } |
| 338 | mut j := i + 1 |
| 339 | for j < c.prog.len && !targets[j] { |
| 340 | next := c.prog[j] |
| 341 | if next.typ == .char && !next.ignore_case && next.val < 128 { |
| 342 | s_val += unsafe { u8(next.val).ascii_str() } |
| 343 | j++ |
| 344 | } else { |
| 345 | break |
| 346 | } |
| 347 | } |
| 348 | if j > i + 1 { |
| 349 | new_prog << Inst{ |
| 350 | typ: .string |
| 351 | val_str: s_val |
| 352 | val_len: s_val.len |
| 353 | } |
| 354 | for k := i + 1; k < j; k++ { |
| 355 | idx_map[k] = new_prog.len |
| 356 | } |
| 357 | i = j |
| 358 | continue |
| 359 | } |
| 360 | } |
| 361 | new_prog << inst |
| 362 | i++ |
| 363 | } |
| 364 | idx_map[c.prog.len] = new_prog.len |
| 365 | |
| 366 | // Fix jump offsets |
| 367 | for mut inst in new_prog { |
| 368 | if inst.typ == .split || inst.typ == .jmp { |
| 369 | inst.target_x = idx_map[inst.target_x] or { inst.target_x } |
| 370 | inst.target_y = idx_map[inst.target_y] or { inst.target_y } |
| 371 | } |
| 372 | } |
| 373 | return new_prog |
| 374 | } |
| 375 | |
| 376 | // emit_class generates the instructions for a character class node. |
| 377 | // It populates the bitmap for O(1) ASCII matching and the slice for Unicode. |
| 378 | fn (mut c Compiler) emit_class(node Node) { |
| 379 | mut bitmap := [4]u32{} |
| 380 | mut char_class := node.char_set.clone() |
| 381 | mut inverted := node.inverted |
| 382 | |
| 383 | // Pre-compile common classes into the bitmap for O(1) lookups |
| 384 | match node.typ { |
| 385 | .word_char { |
| 386 | for r := `0`; r <= `9`; r++ { |
| 387 | set_bitmap(mut bitmap, r) |
| 388 | } |
| 389 | for r := `a`; r <= `z`; r++ { |
| 390 | set_bitmap(mut bitmap, r) |
| 391 | } |
| 392 | for r := `A`; r <= `Z`; r++ { |
| 393 | set_bitmap(mut bitmap, r) |
| 394 | } |
| 395 | set_bitmap(mut bitmap, `_`) |
| 396 | } |
| 397 | .non_word_char { |
| 398 | inverted = true |
| 399 | for r := `0`; r <= `9`; r++ { |
| 400 | set_bitmap(mut bitmap, r) |
| 401 | } |
| 402 | for r := `a`; r <= `z`; r++ { |
| 403 | set_bitmap(mut bitmap, r) |
| 404 | } |
| 405 | for r := `A`; r <= `Z`; r++ { |
| 406 | set_bitmap(mut bitmap, r) |
| 407 | } |
| 408 | set_bitmap(mut bitmap, `_`) |
| 409 | } |
| 410 | .digit { |
| 411 | for r := `0`; r <= `9`; r++ { |
| 412 | set_bitmap(mut bitmap, r) |
| 413 | } |
| 414 | } |
| 415 | .non_digit { |
| 416 | inverted = true |
| 417 | for r := `0`; r <= `9`; r++ { |
| 418 | set_bitmap(mut bitmap, r) |
| 419 | } |
| 420 | } |
| 421 | .whitespace { |
| 422 | for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] { |
| 423 | set_bitmap(mut bitmap, r) |
| 424 | } |
| 425 | } |
| 426 | .non_whitespace { |
| 427 | inverted = true |
| 428 | for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] { |
| 429 | set_bitmap(mut bitmap, r) |
| 430 | } |
| 431 | } |
| 432 | .lowercase_char { |
| 433 | for r := `a`; r <= `z`; r++ { |
| 434 | set_bitmap(mut bitmap, r) |
| 435 | } |
| 436 | } |
| 437 | .uppercase_char { |
| 438 | for r := `A`; r <= `Z`; r++ { |
| 439 | set_bitmap(mut bitmap, r) |
| 440 | } |
| 441 | } |
| 442 | .char_class { |
| 443 | for r in node.char_set { |
| 444 | set_bitmap(mut bitmap, r) |
| 445 | if node.ignore_case { |
| 446 | if r >= `a` && r <= `z` { |
| 447 | set_bitmap(mut bitmap, r - 32) |
| 448 | } else if r >= `A` && r <= `Z` { |
| 449 | set_bitmap(mut bitmap, r + 32) |
| 450 | } |
| 451 | } |
| 452 | } |
| 453 | } |
| 454 | else {} |
| 455 | } |
| 456 | |
| 457 | c.emit(Inst{ |
| 458 | typ: .class |
| 459 | char_class: char_class |
| 460 | bitmap: bitmap |
| 461 | inverted: inverted |
| 462 | ignore_case: node.ignore_case |
| 463 | }) |
| 464 | } |
| 465 | |
| 466 | // emit_node handles quantifiers and loops, delegating the actual logic to emit_logic. |
| 467 | fn (mut c Compiler) emit_node(node Node) { |
| 468 | for _ in 0 .. node.quant.min { |
| 469 | c.emit_logic(node) |
| 470 | } |
| 471 | |
| 472 | if node.quant.max == -1 { |
| 473 | split_idx := c.emit(Inst{ typ: .split }) |
| 474 | start_pc := c.prog.len |
| 475 | c.emit_logic(node) |
| 476 | c.emit(Inst{ typ: .jmp, target_x: split_idx }) |
| 477 | if node.quant.greedy { |
| 478 | c.prog[split_idx].target_x = start_pc |
| 479 | c.prog[split_idx].target_y = c.prog.len |
| 480 | } else { |
| 481 | c.prog[split_idx].target_x = c.prog.len |
| 482 | c.prog[split_idx].target_y = start_pc |
| 483 | } |
| 484 | } else if node.quant.max > node.quant.min { |
| 485 | rem := node.quant.max - node.quant.min |
| 486 | mut splits := []int{} |
| 487 | for _ in 0 .. rem { |
| 488 | idx := c.emit(Inst{ typ: .split }) |
| 489 | match_pc := c.prog.len |
| 490 | if node.quant.greedy { |
| 491 | c.prog[idx].target_x = match_pc |
| 492 | } else { |
| 493 | c.prog[idx].target_y = match_pc |
| 494 | } |
| 495 | c.emit_logic(node) |
| 496 | splits << idx |
| 497 | } |
| 498 | end_pc := c.prog.len |
| 499 | for idx in splits { |
| 500 | if node.quant.greedy { |
| 501 | c.prog[idx].target_y = end_pc |
| 502 | } else { |
| 503 | c.prog[idx].target_x = end_pc |
| 504 | } |
| 505 | } |
| 506 | } |
| 507 | } |
| 508 | |
| 509 | // emit_logic generates instructions for specific node types (char, group, alternation). |
| 510 | fn (mut c Compiler) emit_logic(node Node) { |
| 511 | match node.typ { |
| 512 | .chr { |
| 513 | c.emit(Inst{ typ: .char, val: node.chr, ignore_case: node.ignore_case }) |
| 514 | } |
| 515 | .any_char { |
| 516 | c.emit(Inst{ typ: .any, dot_all: node.dot_all }) |
| 517 | } |
| 518 | .group { |
| 519 | if node.group_capture_index != -1 { |
| 520 | c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 }) |
| 521 | } |
| 522 | for child in node.nodes { |
| 523 | c.emit_node(child) |
| 524 | } |
| 525 | if node.group_capture_index != -1 { |
| 526 | c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 + 1 }) |
| 527 | } |
| 528 | } |
| 529 | .alternation { |
| 530 | if node.alternatives.len == 0 { |
| 531 | return |
| 532 | } |
| 533 | mut end_jumps := []int{} |
| 534 | for i := 0; i < node.alternatives.len - 1; i++ { |
| 535 | split_idx := c.emit(Inst{ typ: .split }) |
| 536 | c.prog[split_idx].target_x = c.prog.len |
| 537 | for child in node.alternatives[i] { |
| 538 | c.emit_node(child) |
| 539 | } |
| 540 | end_jumps << c.emit(Inst{ typ: .jmp }) |
| 541 | c.prog[split_idx].target_y = c.prog.len |
| 542 | } |
| 543 | for child in node.alternatives.last() { |
| 544 | c.emit_node(child) |
| 545 | } |
| 546 | for idx in end_jumps { |
| 547 | c.prog[idx].target_x = c.prog.len |
| 548 | } |
| 549 | } |
| 550 | .start_of_string { |
| 551 | c.emit(Inst{ |
| 552 | typ: if node.multiline { InstType.assert_line_start } else { InstType.assert_start } |
| 553 | }) |
| 554 | } |
| 555 | .end_of_string { |
| 556 | c.emit(Inst{ |
| 557 | typ: if node.multiline { InstType.assert_line_end } else { InstType.assert_end } |
| 558 | }) |
| 559 | } |
| 560 | .word_boundary { |
| 561 | c.emit(Inst{ typ: .assert_bound }) |
| 562 | } |
| 563 | .non_word_boundary { |
| 564 | c.emit(Inst{ typ: .assert_nbound }) |
| 565 | } |
| 566 | else { |
| 567 | c.emit_class(node) |
| 568 | } |
| 569 | } |
| 570 | } |
| 571 | |
| 572 | // parse_nodes implements a recursive descent parser to construct the AST from the pattern string. |
| 573 | fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_start int, passed_flags Flags, mut group_map map[string]int) !([]Node, int, int) { |
| 574 | mut pos := pos_start |
| 575 | mut group_counter := group_counter_start |
| 576 | mut current_flags := passed_flags |
| 577 | mut alternatives := [][]Node{} |
| 578 | mut current_sequence := []Node{} |
| 579 | |
| 580 | for pos < pattern.len { |
| 581 | chr, char_len := read_rune_at(pattern.str, pattern.len, pos) |
| 582 | if chr == terminator { |
| 583 | pos += char_len |
| 584 | if alternatives.len > 0 { |
| 585 | alternatives << current_sequence |
| 586 | return [ |
| 587 | Node{ |
| 588 | typ: .alternation |
| 589 | alternatives: alternatives |
| 590 | quant: Quantifier{1, 1, true} |
| 591 | }, |
| 592 | ], pos, group_counter |
| 593 | } |
| 594 | return current_sequence, pos, group_counter |
| 595 | } |
| 596 | if chr == `|` { |
| 597 | pos += char_len |
| 598 | if current_sequence.len == 0 { |
| 599 | return error('Empty alternative') |
| 600 | } |
| 601 | alternatives << current_sequence |
| 602 | current_sequence = []Node{} |
| 603 | continue |
| 604 | } |
| 605 | |
| 606 | mut parsed_nodes := []Node{} |
| 607 | match chr { |
| 608 | `^` { |
| 609 | pos += char_len |
| 610 | parsed_nodes << Node{ |
| 611 | typ: .start_of_string |
| 612 | multiline: current_flags.multiline |
| 613 | } |
| 614 | } |
| 615 | `$` { |
| 616 | pos += char_len |
| 617 | parsed_nodes << Node{ |
| 618 | typ: .end_of_string |
| 619 | multiline: current_flags.multiline |
| 620 | } |
| 621 | } |
| 622 | `.` { |
| 623 | pos += char_len |
| 624 | parsed_nodes << Node{ |
| 625 | typ: .any_char |
| 626 | dot_all: current_flags.dot_all |
| 627 | } |
| 628 | } |
| 629 | `*`, `+`, `?`, `{` { |
| 630 | return error('Quantifier must follow a token') |
| 631 | } |
| 632 | `(` { |
| 633 | pos += char_len |
| 634 | mut idx := -1 |
| 635 | mut cap := true |
| 636 | if pos < pattern.len && pattern[pos] == `?` { |
| 637 | pos++ |
| 638 | if pos < pattern.len && pattern[pos] in [`i`, `m`, `s`] { |
| 639 | for pos < pattern.len && pattern[pos] != `)` { |
| 640 | match pattern[pos] { |
| 641 | `i` { current_flags.ignore_case = true } |
| 642 | `m` { current_flags.multiline = true } |
| 643 | `s` { current_flags.dot_all = true } |
| 644 | else {} |
| 645 | } |
| 646 | |
| 647 | pos++ |
| 648 | } |
| 649 | if pos < pattern.len { |
| 650 | pos++ |
| 651 | } |
| 652 | continue |
| 653 | } else if pos < pattern.len && pattern[pos] == `:` { |
| 654 | cap = false |
| 655 | pos++ |
| 656 | } else if pos < pattern.len && pattern[pos] == `P` { |
| 657 | pos++ |
| 658 | if pos >= pattern.len || pattern[pos] != `<` { |
| 659 | return error('Invalid named group syntax') |
| 660 | } |
| 661 | pos++ |
| 662 | end := pattern.index_after('>', pos) or { -1 } |
| 663 | if end == -1 { |
| 664 | return error('Unclosed named group') |
| 665 | } |
| 666 | name := pattern[pos..end] |
| 667 | idx = group_counter |
| 668 | group_map[name] = idx |
| 669 | pos = end + 1 |
| 670 | } |
| 671 | } |
| 672 | if cap { |
| 673 | if idx == -1 { |
| 674 | idx = group_counter |
| 675 | } |
| 676 | group_counter++ |
| 677 | } |
| 678 | sub, new_p, new_c := parse_nodes(pattern, pos, `)`, group_counter, current_flags, mut |
| 679 | group_map)! |
| 680 | pos = new_p |
| 681 | group_counter = new_c |
| 682 | parsed_nodes << Node{ |
| 683 | typ: .group |
| 684 | nodes: sub |
| 685 | group_capture_index: idx |
| 686 | } |
| 687 | } |
| 688 | `[` { |
| 689 | pos += char_len |
| 690 | end := pattern.index_after(']', pos) or { -1 } |
| 691 | if end == -1 { |
| 692 | return error('Unclosed character class') |
| 693 | } |
| 694 | content := pattern[pos..end] |
| 695 | pos = end + 1 |
| 696 | inv := content.len > 0 && content[0] == `^` |
| 697 | mut set := []rune{} |
| 698 | mut i := if inv { 1 } else { 0 } |
| 699 | for i < content.len { |
| 700 | c, l := read_rune_at(content.str, content.len, i) |
| 701 | if i + l + 1 < content.len && content[i + l] == `-` { |
| 702 | ec, el := read_rune_at(content.str, content.len, i + l + 1) |
| 703 | for r := c; r <= ec; r++ { |
| 704 | set << r |
| 705 | } |
| 706 | i += l + 1 + el |
| 707 | } else { |
| 708 | set << c |
| 709 | i += l |
| 710 | } |
| 711 | } |
| 712 | parsed_nodes << Node{ |
| 713 | typ: .char_class |
| 714 | char_set: set |
| 715 | inverted: inv |
| 716 | ignore_case: current_flags.ignore_case |
| 717 | } |
| 718 | } |
| 719 | `\\` { |
| 720 | pos += char_len |
| 721 | if pos >= pattern.len { |
| 722 | return error('Trailing backslash') |
| 723 | } |
| 724 | esc, el := read_rune_at(pattern.str, pattern.len, pos) |
| 725 | pos += el |
| 726 | match esc { |
| 727 | `w` { |
| 728 | parsed_nodes << Node{ |
| 729 | typ: .word_char |
| 730 | } |
| 731 | } |
| 732 | `W` { |
| 733 | parsed_nodes << Node{ |
| 734 | typ: .non_word_char |
| 735 | } |
| 736 | } |
| 737 | `d` { |
| 738 | parsed_nodes << Node{ |
| 739 | typ: .digit |
| 740 | } |
| 741 | } |
| 742 | `D` { |
| 743 | parsed_nodes << Node{ |
| 744 | typ: .non_digit |
| 745 | } |
| 746 | } |
| 747 | `s` { |
| 748 | parsed_nodes << Node{ |
| 749 | typ: .whitespace |
| 750 | } |
| 751 | } |
| 752 | `S` { |
| 753 | parsed_nodes << Node{ |
| 754 | typ: .non_whitespace |
| 755 | } |
| 756 | } |
| 757 | `b` { |
| 758 | parsed_nodes << Node{ |
| 759 | typ: .word_boundary |
| 760 | } |
| 761 | } |
| 762 | `B` { |
| 763 | parsed_nodes << Node{ |
| 764 | typ: .non_word_boundary |
| 765 | } |
| 766 | } |
| 767 | `a` { |
| 768 | parsed_nodes << Node{ |
| 769 | typ: .lowercase_char |
| 770 | } |
| 771 | } |
| 772 | `A` { |
| 773 | parsed_nodes << Node{ |
| 774 | typ: .uppercase_char |
| 775 | } |
| 776 | } |
| 777 | else { |
| 778 | parsed_nodes << Node{ |
| 779 | typ: .chr |
| 780 | chr: esc |
| 781 | ignore_case: current_flags.ignore_case |
| 782 | } |
| 783 | } |
| 784 | } |
| 785 | } |
| 786 | else { |
| 787 | pos += char_len |
| 788 | parsed_nodes << Node{ |
| 789 | typ: .chr |
| 790 | chr: chr |
| 791 | ignore_case: current_flags.ignore_case |
| 792 | } |
| 793 | } |
| 794 | } |
| 795 | |
| 796 | if parsed_nodes.len > 0 { |
| 797 | mut q := Quantifier{1, 1, true} |
| 798 | if pos < pattern.len { |
| 799 | peek := pattern[pos] |
| 800 | match peek { |
| 801 | `*` { |
| 802 | q = Quantifier{0, -1, true} |
| 803 | pos++ |
| 804 | } |
| 805 | `+` { |
| 806 | q = Quantifier{1, -1, true} |
| 807 | pos++ |
| 808 | } |
| 809 | `?` { |
| 810 | q = Quantifier{0, 1, true} |
| 811 | pos++ |
| 812 | } |
| 813 | `{` { |
| 814 | end := pattern.index_after('}', pos) or { -1 } |
| 815 | if end == -1 { |
| 816 | return error('Unclosed quantifier') |
| 817 | } |
| 818 | parts := pattern[pos + 1..end].split(',') |
| 819 | min := parts[0].int() |
| 820 | max := if parts.len > 1 { |
| 821 | if parts[1] == '' { -1 } else { parts[1].int() } |
| 822 | } else { |
| 823 | min |
| 824 | } |
| 825 | q = Quantifier{min, max, true} |
| 826 | pos = end + 1 |
| 827 | } |
| 828 | else {} |
| 829 | } |
| 830 | |
| 831 | if pos < pattern.len && pattern[pos] == `?` { |
| 832 | q.greedy = false |
| 833 | pos++ |
| 834 | } |
| 835 | } |
| 836 | parsed_nodes.last().quant = q |
| 837 | } |
| 838 | current_sequence << parsed_nodes |
| 839 | } |
| 840 | |
| 841 | if alternatives.len > 0 { |
| 842 | if current_sequence.len == 0 { |
| 843 | return error('Empty alternative at end of pattern') |
| 844 | } |
| 845 | alternatives << current_sequence |
| 846 | return [ |
| 847 | Node{ |
| 848 | typ: .alternation |
| 849 | alternatives: alternatives |
| 850 | quant: Quantifier{1, 1, true} |
| 851 | }, |
| 852 | ], pos, group_counter |
| 853 | } |
| 854 | return current_sequence, pos, group_counter |
| 855 | } |
| 856 | |
| 857 | /****************************************************************************** |
| 858 | * |
| 859 | * Virtual Machine Execution Engine (Highly Optimized) |
| 860 | * |
| 861 | ******************************************************************************/ |
| 862 | |
| 863 | // vm_match executes the bytecode against the input string using the provided Machine state. |
| 864 | // OPTIMIZATION: Uses raw pointers for instruction and stack access to bypass bounds checking. |
| 865 | @[direct_array_access] |
| 866 | fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match { |
| 867 | unsafe { |
| 868 | // Optimization: Cast voidptr to typed pointer for direct indexing |
| 869 | mut cap_ptr := &int(m.captures.data) |
| 870 | cap_len := m.captures.len |
| 871 | |
| 872 | // Fast clear of captures using pointer arithmetic (memset-like) |
| 873 | for i := 0; i < cap_len; i++ { |
| 874 | cap_ptr[i] = -1 |
| 875 | } |
| 876 | |
| 877 | mut sp := start_pos // String Pointer Index |
| 878 | mut stack_ptr := 0 // Stack Pointer Offset |
| 879 | |
| 880 | cap_size := r.total_groups * 2 |
| 881 | frame_size := cap_size + 2 // [captures..., saved_sp, saved_pc] |
| 882 | |
| 883 | // Raw pointers for hot path access |
| 884 | prog_start := &Inst(r.prog.data) |
| 885 | mut inst_ptr := prog_start // PC as a pointer |
| 886 | |
| 887 | str_ptr := text.str |
| 888 | str_len := text.len |
| 889 | |
| 890 | // Cache stack data pointer (cast to typed pointer) |
| 891 | mut stack_data := &int(m.stack.data) |
| 892 | mut stack_max := m.stack.len |
| 893 | |
| 894 | for { |
| 895 | // Check if we walked off the program (should be caught by match inst) |
| 896 | // Using pointer arithmetic: offset = (inst_ptr - prog_start) |
| 897 | |
| 898 | match inst_ptr.typ { |
| 899 | .match { |
| 900 | // Only allocate result strings on successful match |
| 901 | mut s_groups := []string{cap: r.total_groups} |
| 902 | for i := 0; i < r.total_groups; i++ { |
| 903 | s, e := cap_ptr[i * 2], cap_ptr[i * 2 + 1] |
| 904 | s_groups << if s != -1 && e >= s { text[s..e] } else { '' } |
| 905 | } |
| 906 | return Match{ |
| 907 | text: text[start_pos..sp] |
| 908 | start: start_pos |
| 909 | end: sp |
| 910 | groups: s_groups |
| 911 | } |
| 912 | } |
| 913 | .char { |
| 914 | if sp >= str_len { |
| 915 | goto backtrack |
| 916 | } |
| 917 | curr_byte := str_ptr[sp] |
| 918 | |
| 919 | // Fast ASCII path |
| 920 | if curr_byte < 128 && inst_ptr.val < 128 { |
| 921 | mut c1, mut c2 := curr_byte, u8(inst_ptr.val) |
| 922 | if inst_ptr.ignore_case { |
| 923 | if c1 >= `a` && c1 <= `z` { |
| 924 | c1 -= 32 |
| 925 | } |
| 926 | if c2 >= `a` && c2 <= `z` { |
| 927 | c2 -= 32 |
| 928 | } |
| 929 | } |
| 930 | if c1 == c2 { |
| 931 | sp++ |
| 932 | inst_ptr++ |
| 933 | continue |
| 934 | } |
| 935 | goto backtrack |
| 936 | } |
| 937 | |
| 938 | // UTF-8 Path |
| 939 | rn, l := read_rune_at(str_ptr, str_len, sp) |
| 940 | if l == 0 { |
| 941 | goto backtrack |
| 942 | } |
| 943 | |
| 944 | mut match_ok := false |
| 945 | if inst_ptr.ignore_case { |
| 946 | r1 := if rn >= `a` && rn <= `z` { rn - 32 } else { rn } |
| 947 | r2 := if inst_ptr.val >= `a` && inst_ptr.val <= `z` { |
| 948 | inst_ptr.val - 32 |
| 949 | } else { |
| 950 | inst_ptr.val |
| 951 | } |
| 952 | if r1 == r2 { |
| 953 | match_ok = true |
| 954 | } |
| 955 | } else if rn == inst_ptr.val { |
| 956 | match_ok = true |
| 957 | } |
| 958 | |
| 959 | if match_ok { |
| 960 | sp += l |
| 961 | inst_ptr++ |
| 962 | } else { |
| 963 | goto backtrack |
| 964 | } |
| 965 | } |
| 966 | .string { |
| 967 | if sp + inst_ptr.val_len > str_len { |
| 968 | goto backtrack |
| 969 | } |
| 970 | // Inline memcmp |
| 971 | v_str := inst_ptr.val_str.str |
| 972 | for i in 0 .. inst_ptr.val_len { |
| 973 | if str_ptr[sp + i] != v_str[i] { |
| 974 | goto backtrack |
| 975 | } |
| 976 | } |
| 977 | sp += inst_ptr.val_len |
| 978 | inst_ptr++ |
| 979 | } |
| 980 | .class { |
| 981 | if sp >= str_len { |
| 982 | goto backtrack |
| 983 | } |
| 984 | c_byte := str_ptr[sp] |
| 985 | mut matched := false |
| 986 | mut cl := 1 |
| 987 | |
| 988 | // Optimization: Bitmap lookup for ASCII |
| 989 | if c_byte < 128 { |
| 990 | if (inst_ptr.bitmap[c_byte >> 5] & (u32(1) << (c_byte & 31))) != 0 { |
| 991 | matched = true |
| 992 | } |
| 993 | } else { |
| 994 | rn, l := read_rune_at(str_ptr, str_len, sp) |
| 995 | cl = l |
| 996 | for cr in inst_ptr.char_class { |
| 997 | if cr == rn { |
| 998 | matched = true |
| 999 | break |
| 1000 | } |
| 1001 | } |
| 1002 | } |
| 1003 | |
| 1004 | if matched != inst_ptr.inverted { |
| 1005 | sp += cl |
| 1006 | inst_ptr++ |
| 1007 | } else { |
| 1008 | goto backtrack |
| 1009 | } |
| 1010 | } |
| 1011 | .any { |
| 1012 | if sp >= str_len { |
| 1013 | goto backtrack |
| 1014 | } |
| 1015 | if inst_ptr.dot_all || str_ptr[sp] != `\n` { |
| 1016 | _, cl := read_rune_at(str_ptr, str_len, sp) |
| 1017 | sp += cl |
| 1018 | inst_ptr++ |
| 1019 | } else { |
| 1020 | goto backtrack |
| 1021 | } |
| 1022 | } |
| 1023 | .save { |
| 1024 | cap_ptr[inst_ptr.group_idx] = sp |
| 1025 | inst_ptr++ |
| 1026 | } |
| 1027 | .split { |
| 1028 | if stack_ptr + frame_size >= stack_max { |
| 1029 | new_size := stack_max * 2 |
| 1030 | if new_size > 1_000_000 { |
| 1031 | goto backtrack |
| 1032 | } |
| 1033 | m.stack.grow_len(new_size) |
| 1034 | stack_data = &int(m.stack.data) // Pointer might change on realloc |
| 1035 | stack_max = new_size |
| 1036 | } |
| 1037 | |
| 1038 | // Optimization: Unrolled stack push |
| 1039 | stack_offset := stack_ptr |
| 1040 | for i in 0 .. cap_size { |
| 1041 | stack_data[stack_offset + i] = cap_ptr[i] |
| 1042 | } |
| 1043 | stack_data[stack_offset + cap_size] = sp |
| 1044 | // Save backtrack target PC index |
| 1045 | stack_data[stack_offset + cap_size + 1] = inst_ptr.target_y |
| 1046 | |
| 1047 | stack_ptr += frame_size |
| 1048 | // Jump to primary target (convert index to pointer) |
| 1049 | // FIX: Use pointer indexing instead of addition to avoid type mismatch |
| 1050 | inst_ptr = &prog_start[inst_ptr.target_x] |
| 1051 | } |
| 1052 | .jmp { |
| 1053 | inst_ptr = &prog_start[inst_ptr.target_x] |
| 1054 | } |
| 1055 | .assert_start { |
| 1056 | if sp == 0 { |
| 1057 | inst_ptr++ |
| 1058 | } else { |
| 1059 | goto backtrack |
| 1060 | } |
| 1061 | } |
| 1062 | .assert_end { |
| 1063 | if sp == str_len { |
| 1064 | inst_ptr++ |
| 1065 | } else { |
| 1066 | goto backtrack |
| 1067 | } |
| 1068 | } |
| 1069 | .assert_line_start { |
| 1070 | if sp == 0 || (sp > 0 && str_ptr[sp - 1] == `\n`) { |
| 1071 | inst_ptr++ |
| 1072 | } else { |
| 1073 | goto backtrack |
| 1074 | } |
| 1075 | } |
| 1076 | .assert_line_end { |
| 1077 | if sp == str_len || str_ptr[sp] == `\n` { |
| 1078 | inst_ptr++ |
| 1079 | } else { |
| 1080 | goto backtrack |
| 1081 | } |
| 1082 | } |
| 1083 | .assert_bound, .assert_nbound { |
| 1084 | l := if sp > 0 { is_word_char(str_ptr[sp - 1]) } else { false } |
| 1085 | r_ := if sp < str_len { is_word_char(str_ptr[sp]) } else { false } |
| 1086 | if (inst_ptr.typ == .assert_bound && l != r_) |
| 1087 | || (inst_ptr.typ == .assert_nbound && l == r_) { |
| 1088 | inst_ptr++ |
| 1089 | } else { |
| 1090 | goto backtrack |
| 1091 | } |
| 1092 | } |
| 1093 | } |
| 1094 | |
| 1095 | continue |
| 1096 | |
| 1097 | backtrack: |
| 1098 | if stack_ptr <= 0 { |
| 1099 | return none |
| 1100 | } |
| 1101 | |
| 1102 | stack_ptr -= frame_size |
| 1103 | stack_offset := stack_ptr |
| 1104 | |
| 1105 | // Restore captures |
| 1106 | for i in 0 .. cap_size { |
| 1107 | cap_ptr[i] = stack_data[stack_offset + i] |
| 1108 | } |
| 1109 | sp = stack_data[stack_offset + cap_size] |
| 1110 | // Restore PC from index (explicit cast for pointer arithmetic) |
| 1111 | // FIX: Use pointer indexing instead of addition |
| 1112 | inst_ptr = &prog_start[stack_data[stack_offset + cap_size + 1]] |
| 1113 | } |
| 1114 | } |
| 1115 | return none |
| 1116 | } |
| 1117 | |
| 1118 | /****************************************************************************** |
| 1119 | * |
| 1120 | * Public API |
| 1121 | * |
| 1122 | ******************************************************************************/ |
| 1123 | |
| 1124 | // find returns the first match of the regex in text. |
| 1125 | pub fn (r &Regex) find(text string) ?Match { |
| 1126 | return r.find_from(text, 0) |
| 1127 | } |
| 1128 | |
| 1129 | // find_from returns the first match starting from start_index. |
| 1130 | // Optimized with fast prefix skipping and anchor checks. |
| 1131 | pub fn (r &Regex) find_from(text string, start_index int) ?Match { |
| 1132 | if start_index < 0 || start_index > text.len { |
| 1133 | return none |
| 1134 | } |
| 1135 | mut m := r.new_machine() |
| 1136 | |
| 1137 | // Optimization: Anchored pattern (^) only checks the start |
| 1138 | if r.anchored { |
| 1139 | if start_index == 0 { |
| 1140 | return r.vm_match(text, 0, mut m) |
| 1141 | } |
| 1142 | // If multiline mode is NOT enabled, ^ only matches index 0. |
| 1143 | // If multiline is enabled, we need to check every newline, handled by logic below. |
| 1144 | // Note: The compiler sets anchored=true only for ^ at start. |
| 1145 | // If multiline flag is set dynamically inside pattern, strict anchoring logic might differ, |
| 1146 | // but standard ^ usage benefits here. |
| 1147 | } |
| 1148 | |
| 1149 | // Optimization: Boyer-Moore-like literal prefix skip |
| 1150 | if r.has_prefix { |
| 1151 | mut i := text.index_after(r.prefix_lit, start_index) or { -1 } |
| 1152 | for i != -1 { |
| 1153 | if res := r.vm_match(text, i, mut m) { |
| 1154 | return res |
| 1155 | } |
| 1156 | i = text.index_after(r.prefix_lit, i + 1) or { -1 } |
| 1157 | } |
| 1158 | return none |
| 1159 | } |
| 1160 | |
| 1161 | for i := start_index; i <= text.len; i++ { |
| 1162 | // Skip UTF-8 continuation bytes to ensure we only match at rune boundaries |
| 1163 | // 0xC0 (11000000) is the start of a multi-byte sequence, 0x80 (10000000) is a continuation |
| 1164 | if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 { |
| 1165 | continue |
| 1166 | } |
| 1167 | if res := r.vm_match(text, i, mut m) { |
| 1168 | return res |
| 1169 | } |
| 1170 | } |
| 1171 | return none |
| 1172 | } |
| 1173 | |
| 1174 | // find_all returns all non-overlapping matches in text. |
| 1175 | pub fn (r &Regex) find_all(text string) []Match { |
| 1176 | mut matches := []Match{} |
| 1177 | mut m := r.new_machine() |
| 1178 | mut i := 0 |
| 1179 | for i <= text.len { |
| 1180 | if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 { |
| 1181 | i++ |
| 1182 | continue |
| 1183 | } |
| 1184 | if res := r.vm_match(text, i, mut m) { |
| 1185 | matches << res |
| 1186 | i = if res.end > i { res.end } else { i + 1 } |
| 1187 | } else { |
| 1188 | i++ |
| 1189 | } |
| 1190 | } |
| 1191 | return matches |
| 1192 | } |
| 1193 | |
| 1194 | // replace finds the first match and replaces it using repl. Supports $1, $2 backreferences. |
| 1195 | pub fn (r &Regex) replace(text string, repl string) string { |
| 1196 | res := r.find(text) or { return text } |
| 1197 | mut sb := strings.new_builder(text.len + repl.len) |
| 1198 | sb.write_string(text[0..res.start]) |
| 1199 | mut i := 0 |
| 1200 | for i < repl.len { |
| 1201 | if repl[i] == `$` && i + 1 < repl.len { |
| 1202 | next_char := repl[i + 1] |
| 1203 | if next_char >= `0` && next_char <= `9` { |
| 1204 | d := next_char - `0` - 1 |
| 1205 | if d >= 0 && d < res.groups.len { |
| 1206 | sb.write_string(res.groups[d]) |
| 1207 | } |
| 1208 | i += 2 |
| 1209 | continue |
| 1210 | } |
| 1211 | } |
| 1212 | sb.write_u8(repl[i]) |
| 1213 | i++ |
| 1214 | } |
| 1215 | sb.write_string(text[res.end..]) |
| 1216 | return sb.str() |
| 1217 | } |
| 1218 | |
| 1219 | // fullmatch returns a Match only if the pattern matches the entire text. |
| 1220 | pub fn (r &Regex) fullmatch(text string) ?Match { |
| 1221 | mut m := r.new_machine() |
| 1222 | if res := r.vm_match(text, 0, mut m) { |
| 1223 | if res.end == text.len { |
| 1224 | return res |
| 1225 | } |
| 1226 | } |
| 1227 | return none |
| 1228 | } |
| 1229 | |
| 1230 | // group_by_name retrieves text captured by a named group. |
| 1231 | pub fn (r &Regex) group_by_name(m Match, name string) string { |
| 1232 | idx := r.group_map[name] or { return '' } |
| 1233 | return if idx < m.groups.len { m.groups[idx] } else { '' } |
| 1234 | } |
| 1235 | |
| 1236 | /****************************************************************************** |
| 1237 | * |
| 1238 | * Compatibility Layer |
| 1239 | * |
| 1240 | ******************************************************************************/ |
| 1241 | |
| 1242 | // new_regex is a helper wrapper to compile a regex pattern. |
| 1243 | pub fn new_regex(pattern string, _ int) !Regex { |
| 1244 | return compile(pattern) |
| 1245 | } |
| 1246 | |
| 1247 | // match_str is a compatibility alias for find_from. |
| 1248 | pub fn (r &Regex) match_str(text string, start_index int, _ int) ?Match { |
| 1249 | return r.find_from(text, start_index) |
| 1250 | } |
| 1251 | |
| 1252 | // get returns the matched text for a specific group index. |
| 1253 | // Index 0 returns the full match, 1..n returns capture groups. |
| 1254 | pub fn (m Match) get(idx int) ?string { |
| 1255 | if idx == 0 { |
| 1256 | return m.text |
| 1257 | } |
| 1258 | if idx > 0 && idx <= m.groups.len { |
| 1259 | return m.groups[idx - 1] |
| 1260 | } |
| 1261 | return none |
| 1262 | } |
| 1263 | |
| 1264 | // get_all returns a list of all captured strings, starting with the full match at index 0. |
| 1265 | pub fn (m Match) get_all() []string { |
| 1266 | mut res := [m.text] |
| 1267 | res << m.groups |
| 1268 | return res |
| 1269 | } |
| 1270 | |