| 1 | /* |
| 2 | regex2 0.9.6 beta (VM Edition) - Performance Optimized |
| 3 | |
| 4 | Copyright (c) 2026 Dario Deledda. All rights reserved. |
| 5 | Use of this source code is governed by an MIT license |
| 6 | that can be found in the LICENSE file. |
| 7 | |
| 8 | This file contains a regex module based on a Virtual Machine approach. |
| 9 | |
| 10 | Features: |
| 11 | - **Non-recursive VM**: Safe execution without stack overflow on complex patterns. |
| 12 | - UTF8 support with Fast ASCII path. |
| 13 | - Literal characters, '.', '*', '{m,n}'. |
| 14 | - Short quantifiers: '?', '+'. |
| 15 | - Non-greedy quantifiers: '*?', '+?', '??'. |
| 16 | - Nested groups: '()'. |
| 17 | - Named groups: '(?P<name>...)'. |
| 18 | - Non-capturing groups: '(?:...)'. |
| 19 | - Character classes: \w, \W, \d, \D, \s, \S, \a, \A. |
| 20 | - Hex escapes: \xHH, \XHHHH. |
| 21 | - Alternation: '|'. |
| 22 | - Character classes: [abc], [^abc], [a-z]. |
| 23 | - Anchors: '^' (start of string), '$' (end of string). |
| 24 | - Word boundaries: '\b' (word boundary), '\B' (non-word boundary). |
| 25 | - Inline Flags: |
| 26 | - '(?i)' Case insensitive. |
| 27 | - '(?m)' Multiline (anchors match newlines). |
| 28 | - '(?s)' Dot-all (dot matches newline). |
| 29 | |
| 30 | Configuration: |
| 31 | - `max_stack_depth`: Controls the dynamic growth limit of the backtracking stack. |
| 32 | Default is 2048. Increase this value if you encounter complex patterns failing |
| 33 | on deep recursions/backtracking, or decrease it to limit memory usage. |
| 34 | |
| 35 | Functions: |
| 36 | - `compile(pattern) !Regex` -> Compiles a pattern into a Regex object. |
| 37 | - `(r Regex) find(text) ?Match` -> Finds the first match in a string. |
| 38 | - `(r Regex) find_all(text) []Match` -> Finds all non-overlapping matches. |
| 39 | - `(r Regex) find_from(text, start_index) ?Match` -> Finds the first match starting from a specific index. |
| 40 | - `(r Regex) replace(text, repl) string` -> Finds and replaces the first match. |
| 41 | - `(r Regex) fullmatch(text string) ?Match` -> Match exact strings. |
| 42 | - `(r Regex) group_by_name(m Match, name string) string` -> Get text of a named group. |
| 43 | |
| 44 | Key Architectural Features and Optimizations: |
| 45 | - **Pre-allocated Machine workspace**: All search operations are zero-allocation. |
| 46 | - **Raw pointer access**: Bypasses bounds checking for string and instruction access. |
| 47 | - **Fast ASCII Path**: Characters < 128 bypass heavy UTF-8 decoding. |
| 48 | - **Instruction merging**: Consecutive character matches are merged into string blocks. |
| 49 | - **Bitmap lookups**: ASCII character classes use a 128-bit bitset for O(1) matching. |
| 50 | - **NFA Virtual Machine**: Executes bytecode instructions to simulate pattern matching. |
| 51 | - **Dynamic Stack Growth**: Automatically expands the backtracking stack to prevent false negatives. |
| 52 | - **Zero-Allocation Search**: Reuses a pre-allocated Machine workspace for search operations. |
| 53 | - **Anchored Optimization**: Patterns starting with '^' skip the scanning loop. |
| 54 | - **Prefix Skipping**: Uses Boyer-Moore-like skipping for literal prefixes. |
| 55 | |
| 56 | */ |
| 57 | |
| 58 | module pcre |
| 59 | |
| 60 | import strconv |
| 61 | import strings |
| 62 | |
| 63 | /****************************************************************************** |
| 64 | * |
| 65 | * VM Instruction Definition |
| 66 | * |
| 67 | ******************************************************************************/ |
| 68 | |
| 69 | // InstType defines the operation codes for the Virtual Machine. |
| 70 | enum InstType as u8 { |
| 71 | match // Halt and signal a successful match. |
| 72 | char // Match a single UTF-8 rune. |
| 73 | string // Match a sequence of ASCII characters (merged optimization). |
| 74 | any // Match any character (dot). |
| 75 | class // Match a character class (e.g., [a-z] or \d). |
| 76 | split // Branch execution: target_x (primary), target_y (backtrack). |
| 77 | jmp // Unconditional jump to target_x. |
| 78 | save // Save current string position into a capture group index. |
| 79 | assert_start // Assert current position is start of string. |
| 80 | assert_end // Assert current position is end of string. |
| 81 | assert_line_start // Assert current position is start of a line (multiline mode). |
| 82 | assert_line_end // Assert current position is end of a line (multiline mode). |
| 83 | assert_bound // Assert word boundary (\b). |
| 84 | assert_nbound // Assert non-word boundary (\B). |
| 85 | } |
| 86 | |
| 87 | // Inst represents a single bytecode instruction. |
| 88 | // Packed for memory locality. |
| 89 | struct Inst { |
| 90 | mut: |
| 91 | typ InstType |
| 92 | val rune // Used by .char |
| 93 | val_str string // Used by .string |
| 94 | val_len int // Byte length of val_str |
| 95 | target_x int // Primary jump/split target |
| 96 | target_y int // Backtrack target for .split |
| 97 | group_idx int // Capture group index for .save |
| 98 | char_class []rune // Literal runes for character classes |
| 99 | bitmap [4]u32 // 128-bit bitset for ASCII character classes |
| 100 | inverted bool // Negation flag for classes |
| 101 | ignore_case bool // Case-insensitive flag |
| 102 | dot_all bool // Whether '.' matches '\n' |
| 103 | } |
| 104 | |
| 105 | // Machine provides a workspace for VM execution. |
| 106 | // To ensure thread safety, this is created per top-level API call. |
| 107 | pub struct Machine { |
| 108 | mut: |
| 109 | stack []int // Backtracking stack stores: [capture_states..., string_ptr, next_pc] |
| 110 | captures []int // Flat array of [start, end] byte indices for groups |
| 111 | } |
| 112 | |
| 113 | // Regex is the compiled regular expression object. |
| 114 | pub struct Regex { |
| 115 | pub: |
| 116 | pattern string // The original regex string |
| 117 | prog []Inst // Compiled bytecode |
| 118 | total_groups int // Number of capture groups defined in pattern |
| 119 | group_map map[string]int // Mapping of names to indices for (?P<name>...) |
| 120 | prefix_lit string // Pre-calculated literal prefix for fast-skip optimization |
| 121 | has_prefix bool // Whether a literal prefix exists |
| 122 | anchored bool // True if pattern starts with '^' (optimization hint) |
| 123 | pub mut: |
| 124 | max_stack_depth int // User-defined stack limit hint |
| 125 | } |
| 126 | |
| 127 | // Match contains the results of a successful regex match. |
| 128 | pub struct Match { |
| 129 | pub: |
| 130 | text string // The full text of the match |
| 131 | start int // Starting byte index in the source string |
| 132 | end int // Ending byte index in the source string |
| 133 | groups []string // Sub-strings captured by groups |
| 134 | } |
| 135 | |
| 136 | // Internal structures for Compilation |
| 137 | struct Quantifier { |
| 138 | mut: |
| 139 | min int |
| 140 | max int // -1 for infinity |
| 141 | greedy bool |
| 142 | } |
| 143 | |
| 144 | struct Flags { |
| 145 | mut: |
| 146 | ignore_case bool |
| 147 | multiline bool |
| 148 | dot_all bool |
| 149 | } |
| 150 | |
| 151 | enum NodeType { |
| 152 | chr |
| 153 | any_char |
| 154 | group |
| 155 | alternation |
| 156 | char_class |
| 157 | start_of_string |
| 158 | end_of_string |
| 159 | word_boundary |
| 160 | non_word_boundary |
| 161 | word_char |
| 162 | non_word_char |
| 163 | digit |
| 164 | non_digit |
| 165 | whitespace |
| 166 | non_whitespace |
| 167 | lowercase_char |
| 168 | uppercase_char |
| 169 | } |
| 170 | |
| 171 | struct Node { |
| 172 | mut: |
| 173 | typ NodeType |
| 174 | chr rune |
| 175 | quant Quantifier |
| 176 | nodes []Node |
| 177 | alternatives [][]Node |
| 178 | group_capture_index int = -1 |
| 179 | char_set []rune |
| 180 | inverted bool |
| 181 | ignore_case bool |
| 182 | multiline bool |
| 183 | dot_all bool |
| 184 | } |
| 185 | |
| 186 | /****************************************************************************** |
| 187 | * |
| 188 | * Internal Utilities (Inlined for Speed) |
| 189 | * |
| 190 | ******************************************************************************/ |
| 191 | |
| 192 | // read_rune_at decodes a UTF-8 rune from a byte pointer safely. |
| 193 | // Marked inline to be embedded directly into the VM loop. |
| 194 | @[inline] |
| 195 | fn read_rune_at(str &u8, len int, index int) (rune, int) { |
| 196 | unsafe { |
| 197 | if index >= len { |
| 198 | return 0, 0 |
| 199 | } |
| 200 | b0 := u32(str[index]) |
| 201 | if b0 < 0x80 { |
| 202 | return rune(b0), 1 |
| 203 | } |
| 204 | if (b0 & 0xE0) == 0xC0 && index + 1 < len { |
| 205 | return rune(((b0 & 0x1F) << 6) | (u32(str[index + 1]) & 0x3F)), 2 |
| 206 | } |
| 207 | if (b0 & 0xF0) == 0xE0 && index + 2 < len { |
| 208 | return rune(((b0 & 0x0F) << 12) | ((u32(str[index + 1]) & 0x3F) << 6) | (u32(str[ |
| 209 | index + 2]) & 0x3F)), 3 |
| 210 | } |
| 211 | if (b0 & 0xF8) == 0xF0 && index + 3 < len { |
| 212 | return rune(((b0 & 0x07) << 18) | ((u32(str[index + 1]) & 0x3F) << 12) | ((u32(str[ |
| 213 | index + 2]) & 0x3F) << 6) | (u32(str[index + 3]) & 0x3F)), 4 |
| 214 | } |
| 215 | } |
| 216 | return 0, 0 |
| 217 | } |
| 218 | |
| 219 | // is_word_char returns true for alphanumeric ASCII characters and underscore. |
| 220 | @[inline] |
| 221 | fn is_word_char(r u8) bool { |
| 222 | return (r >= `a` && r <= `z`) || (r >= `A` && r <= `Z`) || (r >= `0` && r <= `9`) || r == `_` |
| 223 | } |
| 224 | |
| 225 | // set_bitmap sets a specific bit in a 128-bit bitset for ASCII matching. |
| 226 | @[inline] |
| 227 | fn set_bitmap(mut bitmap [4]u32, r rune) { |
| 228 | if r >= 0 && r < 128 { |
| 229 | idx := u32(r) >> 5 |
| 230 | bit := u32(r) & 31 |
| 231 | bitmap[idx] |= (u32(1) << bit) |
| 232 | } |
| 233 | } |
| 234 | |
| 235 | /****************************************************************************** |
| 236 | * |
| 237 | * Compiler Logic |
| 238 | * |
| 239 | ******************************************************************************/ |
| 240 | |
| 241 | // compile transforms a regex pattern string into a Regex object. |
| 242 | pub fn compile(pattern string) !Regex { |
| 243 | mut group_map := map[string]int{} |
| 244 | initial_flags := Flags{false, false, false} |
| 245 | |
| 246 | // Phase 1: AST Parsing |
| 247 | nodes, _, final_group_count := parse_nodes(pattern, 0, `\0`, 0, initial_flags, mut group_map)! |
| 248 | |
| 249 | root := Node{ |
| 250 | typ: .group |
| 251 | nodes: nodes |
| 252 | quant: Quantifier{1, 1, true} |
| 253 | } |
| 254 | |
| 255 | // Phase 2: Bytecode Emission |
| 256 | mut compiler := Compiler{ |
| 257 | prog: []Inst{cap: 128} |
| 258 | } |
| 259 | compiler.emit_node(root) |
| 260 | compiler.emit(Inst{ typ: .match }) |
| 261 | |
| 262 | // Phase 3: Optimization |
| 263 | optimized_prog := compiler.optimize() |
| 264 | |
| 265 | // Detect Prefix and Anchor optimizations |
| 266 | mut prefix := '' |
| 267 | mut has_prefix := false |
| 268 | mut anchored := false |
| 269 | |
| 270 | if optimized_prog.len > 0 { |
| 271 | first := optimized_prog[0] |
| 272 | if first.typ == .string { |
| 273 | prefix = first.val_str |
| 274 | has_prefix = true |
| 275 | } else if first.typ == .char && !first.ignore_case && first.val < 128 { |
| 276 | prefix = unsafe { u8(first.val).ascii_str() } |
| 277 | has_prefix = true |
| 278 | } else if first.typ == .assert_start { |
| 279 | anchored = true |
| 280 | } |
| 281 | } |
| 282 | |
| 283 | return Regex{ |
| 284 | max_stack_depth: 2048 |
| 285 | pattern: pattern |
| 286 | prog: optimized_prog |
| 287 | total_groups: final_group_count |
| 288 | group_map: group_map |
| 289 | prefix_lit: prefix |
| 290 | has_prefix: has_prefix |
| 291 | anchored: anchored |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | // new_machine allocates a new VM state machine. |
| 296 | // This isolates the runtime memory (stack/captures) from the compiled regex, allowing thread-safe usage. |
| 297 | pub fn (r &Regex) new_machine() Machine { |
| 298 | // Pre-allocate enough space for stack and captures to avoid re-allocation in hot path |
| 299 | return Machine{ |
| 300 | stack: []int{len: r.max_stack_depth} |
| 301 | captures: []int{len: r.total_groups * 2} |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | // Compiler holds the state for generating the bytecode instructions. |
| 306 | struct Compiler { |
| 307 | mut: |
| 308 | prog []Inst |
| 309 | } |
| 310 | |
| 311 | // emit appends an instruction to the program and returns its index. |
| 312 | fn (mut c Compiler) emit(i Inst) int { |
| 313 | c.prog << i |
| 314 | return c.prog.len - 1 |
| 315 | } |
| 316 | |
| 317 | // optimize merges consecutive literal characters into single string instructions |
| 318 | // and resolves jump targets to absolute indices. |
| 319 | fn (mut c Compiler) optimize() []Inst { |
| 320 | mut targets := map[int]bool{} |
| 321 | for inst in c.prog { |
| 322 | if inst.typ == .split || inst.typ == .jmp { |
| 323 | targets[inst.target_x] = true |
| 324 | targets[inst.target_y] = true |
| 325 | } |
| 326 | } |
| 327 | |
| 328 | mut new_prog := []Inst{cap: c.prog.len} |
| 329 | mut idx_map := map[int]int{} |
| 330 | |
| 331 | mut i := 0 |
| 332 | for i < c.prog.len { |
| 333 | inst := c.prog[i] |
| 334 | idx_map[i] = new_prog.len |
| 335 | |
| 336 | // Optimization: Merge consecutive chars |
| 337 | if inst.typ == .char && !inst.ignore_case && inst.val < 128 { |
| 338 | mut s_val := unsafe { u8(inst.val).ascii_str() } |
| 339 | mut j := i + 1 |
| 340 | for j < c.prog.len && !targets[j] { |
| 341 | next := c.prog[j] |
| 342 | if next.typ == .char && !next.ignore_case && next.val < 128 { |
| 343 | s_val += unsafe { u8(next.val).ascii_str() } |
| 344 | j++ |
| 345 | } else { |
| 346 | break |
| 347 | } |
| 348 | } |
| 349 | if j > i + 1 { |
| 350 | new_prog << Inst{ |
| 351 | typ: .string |
| 352 | val_str: s_val |
| 353 | val_len: s_val.len |
| 354 | } |
| 355 | for k := i + 1; k < j; k++ { |
| 356 | idx_map[k] = new_prog.len |
| 357 | } |
| 358 | i = j |
| 359 | continue |
| 360 | } |
| 361 | } |
| 362 | new_prog << inst |
| 363 | i++ |
| 364 | } |
| 365 | idx_map[c.prog.len] = new_prog.len |
| 366 | |
| 367 | // Fix jump offsets |
| 368 | for mut inst in new_prog { |
| 369 | if inst.typ == .split || inst.typ == .jmp { |
| 370 | inst.target_x = idx_map[inst.target_x] or { inst.target_x } |
| 371 | inst.target_y = idx_map[inst.target_y] or { inst.target_y } |
| 372 | } |
| 373 | } |
| 374 | return new_prog |
| 375 | } |
| 376 | |
| 377 | // emit_class generates the instructions for a character class node. |
| 378 | // It populates the bitmap for O(1) ASCII matching and the slice for Unicode. |
| 379 | fn (mut c Compiler) emit_class(node Node) { |
| 380 | mut bitmap := [4]u32{} |
| 381 | mut char_class := node.char_set.clone() |
| 382 | mut inverted := node.inverted |
| 383 | |
| 384 | // Pre-compile common classes into the bitmap for O(1) lookups |
| 385 | match node.typ { |
| 386 | .word_char { |
| 387 | for r := `0`; r <= `9`; r++ { |
| 388 | set_bitmap(mut bitmap, r) |
| 389 | } |
| 390 | for r := `a`; r <= `z`; r++ { |
| 391 | set_bitmap(mut bitmap, r) |
| 392 | } |
| 393 | for r := `A`; r <= `Z`; r++ { |
| 394 | set_bitmap(mut bitmap, r) |
| 395 | } |
| 396 | set_bitmap(mut bitmap, `_`) |
| 397 | } |
| 398 | .non_word_char { |
| 399 | inverted = true |
| 400 | for r := `0`; r <= `9`; r++ { |
| 401 | set_bitmap(mut bitmap, r) |
| 402 | } |
| 403 | for r := `a`; r <= `z`; r++ { |
| 404 | set_bitmap(mut bitmap, r) |
| 405 | } |
| 406 | for r := `A`; r <= `Z`; r++ { |
| 407 | set_bitmap(mut bitmap, r) |
| 408 | } |
| 409 | set_bitmap(mut bitmap, `_`) |
| 410 | } |
| 411 | .digit { |
| 412 | for r := `0`; r <= `9`; r++ { |
| 413 | set_bitmap(mut bitmap, r) |
| 414 | } |
| 415 | } |
| 416 | .non_digit { |
| 417 | inverted = true |
| 418 | for r := `0`; r <= `9`; r++ { |
| 419 | set_bitmap(mut bitmap, r) |
| 420 | } |
| 421 | } |
| 422 | .whitespace { |
| 423 | for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] { |
| 424 | set_bitmap(mut bitmap, r) |
| 425 | } |
| 426 | } |
| 427 | .non_whitespace { |
| 428 | inverted = true |
| 429 | for r in [` `, `\t`, `\n`, `\r`, `\v`, `\f`] { |
| 430 | set_bitmap(mut bitmap, r) |
| 431 | } |
| 432 | } |
| 433 | .lowercase_char { |
| 434 | for r := `a`; r <= `z`; r++ { |
| 435 | set_bitmap(mut bitmap, r) |
| 436 | } |
| 437 | } |
| 438 | .uppercase_char { |
| 439 | for r := `A`; r <= `Z`; r++ { |
| 440 | set_bitmap(mut bitmap, r) |
| 441 | } |
| 442 | } |
| 443 | .char_class { |
| 444 | for r in node.char_set { |
| 445 | set_bitmap(mut bitmap, r) |
| 446 | if node.ignore_case { |
| 447 | if r >= `a` && r <= `z` { |
| 448 | set_bitmap(mut bitmap, r - 32) |
| 449 | } else if r >= `A` && r <= `Z` { |
| 450 | set_bitmap(mut bitmap, r + 32) |
| 451 | } |
| 452 | } |
| 453 | } |
| 454 | } |
| 455 | else {} |
| 456 | } |
| 457 | |
| 458 | c.emit(Inst{ |
| 459 | typ: .class |
| 460 | char_class: char_class |
| 461 | bitmap: bitmap |
| 462 | inverted: inverted |
| 463 | ignore_case: node.ignore_case |
| 464 | }) |
| 465 | } |
| 466 | |
| 467 | // emit_node handles quantifiers and loops, delegating the actual logic to emit_logic. |
| 468 | fn (mut c Compiler) emit_node(node Node) { |
| 469 | for _ in 0 .. node.quant.min { |
| 470 | c.emit_logic(node) |
| 471 | } |
| 472 | |
| 473 | if node.quant.max == -1 { |
| 474 | split_idx := c.emit(Inst{ typ: .split }) |
| 475 | start_pc := c.prog.len |
| 476 | c.emit_logic(node) |
| 477 | c.emit(Inst{ typ: .jmp, target_x: split_idx }) |
| 478 | if node.quant.greedy { |
| 479 | c.prog[split_idx].target_x = start_pc |
| 480 | c.prog[split_idx].target_y = c.prog.len |
| 481 | } else { |
| 482 | c.prog[split_idx].target_x = c.prog.len |
| 483 | c.prog[split_idx].target_y = start_pc |
| 484 | } |
| 485 | } else if node.quant.max > node.quant.min { |
| 486 | rem := node.quant.max - node.quant.min |
| 487 | mut splits := []int{} |
| 488 | for _ in 0 .. rem { |
| 489 | idx := c.emit(Inst{ typ: .split }) |
| 490 | match_pc := c.prog.len |
| 491 | if node.quant.greedy { |
| 492 | c.prog[idx].target_x = match_pc |
| 493 | } else { |
| 494 | c.prog[idx].target_y = match_pc |
| 495 | } |
| 496 | c.emit_logic(node) |
| 497 | splits << idx |
| 498 | } |
| 499 | end_pc := c.prog.len |
| 500 | for idx in splits { |
| 501 | if node.quant.greedy { |
| 502 | c.prog[idx].target_y = end_pc |
| 503 | } else { |
| 504 | c.prog[idx].target_x = end_pc |
| 505 | } |
| 506 | } |
| 507 | } |
| 508 | } |
| 509 | |
| 510 | // emit_logic generates instructions for specific node types (char, group, alternation). |
| 511 | fn (mut c Compiler) emit_logic(node Node) { |
| 512 | match node.typ { |
| 513 | .chr { |
| 514 | c.emit(Inst{ typ: .char, val: node.chr, ignore_case: node.ignore_case }) |
| 515 | } |
| 516 | .any_char { |
| 517 | c.emit(Inst{ typ: .any, dot_all: node.dot_all }) |
| 518 | } |
| 519 | .group { |
| 520 | if node.group_capture_index != -1 { |
| 521 | c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 }) |
| 522 | } |
| 523 | for child in node.nodes { |
| 524 | c.emit_node(child) |
| 525 | } |
| 526 | if node.group_capture_index != -1 { |
| 527 | c.emit(Inst{ typ: .save, group_idx: node.group_capture_index * 2 + 1 }) |
| 528 | } |
| 529 | } |
| 530 | .alternation { |
| 531 | if node.alternatives.len == 0 { |
| 532 | return |
| 533 | } |
| 534 | mut end_jumps := []int{} |
| 535 | for i := 0; i < node.alternatives.len - 1; i++ { |
| 536 | split_idx := c.emit(Inst{ typ: .split }) |
| 537 | c.prog[split_idx].target_x = c.prog.len |
| 538 | for child in node.alternatives[i] { |
| 539 | c.emit_node(child) |
| 540 | } |
| 541 | end_jumps << c.emit(Inst{ typ: .jmp }) |
| 542 | c.prog[split_idx].target_y = c.prog.len |
| 543 | } |
| 544 | for child in node.alternatives.last() { |
| 545 | c.emit_node(child) |
| 546 | } |
| 547 | for idx in end_jumps { |
| 548 | c.prog[idx].target_x = c.prog.len |
| 549 | } |
| 550 | } |
| 551 | .start_of_string { |
| 552 | c.emit(Inst{ |
| 553 | typ: if node.multiline { InstType.assert_line_start } else { InstType.assert_start } |
| 554 | }) |
| 555 | } |
| 556 | .end_of_string { |
| 557 | c.emit(Inst{ |
| 558 | typ: if node.multiline { InstType.assert_line_end } else { InstType.assert_end } |
| 559 | }) |
| 560 | } |
| 561 | .word_boundary { |
| 562 | c.emit(Inst{ typ: .assert_bound }) |
| 563 | } |
| 564 | .non_word_boundary { |
| 565 | c.emit(Inst{ typ: .assert_nbound }) |
| 566 | } |
| 567 | else { |
| 568 | c.emit_class(node) |
| 569 | } |
| 570 | } |
| 571 | } |
| 572 | |
| 573 | // parse_nodes implements a recursive descent parser to construct the AST from the pattern string. |
| 574 | fn parse_nodes(pattern string, pos_start int, terminator rune, group_counter_start int, passed_flags Flags, mut group_map map[string]int) !([]Node, int, int) { |
| 575 | mut pos := pos_start |
| 576 | mut group_counter := group_counter_start |
| 577 | mut current_flags := passed_flags |
| 578 | mut alternatives := [][]Node{} |
| 579 | mut current_sequence := []Node{} |
| 580 | |
| 581 | for pos < pattern.len { |
| 582 | chr, char_len := read_rune_at(pattern.str, pattern.len, pos) |
| 583 | if chr == terminator { |
| 584 | pos += char_len |
| 585 | if alternatives.len > 0 { |
| 586 | alternatives << current_sequence |
| 587 | return [ |
| 588 | Node{ |
| 589 | typ: .alternation |
| 590 | alternatives: alternatives |
| 591 | quant: Quantifier{1, 1, true} |
| 592 | }, |
| 593 | ], pos, group_counter |
| 594 | } |
| 595 | return current_sequence, pos, group_counter |
| 596 | } |
| 597 | if chr == `|` { |
| 598 | pos += char_len |
| 599 | if current_sequence.len == 0 { |
| 600 | return error('Empty alternative') |
| 601 | } |
| 602 | alternatives << current_sequence |
| 603 | current_sequence = []Node{} |
| 604 | continue |
| 605 | } |
| 606 | |
| 607 | mut parsed_nodes := []Node{} |
| 608 | match chr { |
| 609 | `^` { |
| 610 | pos += char_len |
| 611 | parsed_nodes << Node{ |
| 612 | typ: .start_of_string |
| 613 | multiline: current_flags.multiline |
| 614 | } |
| 615 | } |
| 616 | `$` { |
| 617 | pos += char_len |
| 618 | parsed_nodes << Node{ |
| 619 | typ: .end_of_string |
| 620 | multiline: current_flags.multiline |
| 621 | } |
| 622 | } |
| 623 | `.` { |
| 624 | pos += char_len |
| 625 | parsed_nodes << Node{ |
| 626 | typ: .any_char |
| 627 | dot_all: current_flags.dot_all |
| 628 | } |
| 629 | } |
| 630 | `*`, `+`, `?`, `{` { |
| 631 | return error('Quantifier must follow a token') |
| 632 | } |
| 633 | `(` { |
| 634 | pos += char_len |
| 635 | mut idx := -1 |
| 636 | mut cap := true |
| 637 | if pos < pattern.len && pattern[pos] == `?` { |
| 638 | pos++ |
| 639 | if pos < pattern.len && pattern[pos] in [`i`, `m`, `s`] { |
| 640 | for pos < pattern.len && pattern[pos] != `)` { |
| 641 | match pattern[pos] { |
| 642 | `i` { current_flags.ignore_case = true } |
| 643 | `m` { current_flags.multiline = true } |
| 644 | `s` { current_flags.dot_all = true } |
| 645 | else {} |
| 646 | } |
| 647 | |
| 648 | pos++ |
| 649 | } |
| 650 | if pos < pattern.len { |
| 651 | pos++ |
| 652 | } |
| 653 | continue |
| 654 | } else if pos < pattern.len && pattern[pos] == `:` { |
| 655 | cap = false |
| 656 | pos++ |
| 657 | } else if pos < pattern.len && pattern[pos] == `P` { |
| 658 | pos++ |
| 659 | if pos >= pattern.len || pattern[pos] != `<` { |
| 660 | return error('Invalid named group syntax') |
| 661 | } |
| 662 | pos++ |
| 663 | end := pattern.index_after('>', pos) or { -1 } |
| 664 | if end == -1 { |
| 665 | return error('Unclosed named group') |
| 666 | } |
| 667 | name := pattern[pos..end] |
| 668 | if name in group_map { |
| 669 | return error('Duplicate named group: ${name}') |
| 670 | } |
| 671 | idx = group_counter |
| 672 | group_map[name] = idx |
| 673 | pos = end + 1 |
| 674 | } |
| 675 | } |
| 676 | if cap { |
| 677 | if idx == -1 { |
| 678 | idx = group_counter |
| 679 | } |
| 680 | group_counter++ |
| 681 | } |
| 682 | sub, new_p, new_c := parse_nodes(pattern, pos, `)`, group_counter, current_flags, mut |
| 683 | group_map)! |
| 684 | pos = new_p |
| 685 | group_counter = new_c |
| 686 | parsed_nodes << Node{ |
| 687 | typ: .group |
| 688 | nodes: sub |
| 689 | group_capture_index: idx |
| 690 | } |
| 691 | } |
| 692 | `[` { |
| 693 | pos += char_len |
| 694 | end := pattern.index_after(']', pos) or { -1 } |
| 695 | if end == -1 { |
| 696 | return error('Unclosed character class') |
| 697 | } |
| 698 | content := pattern[pos..end] |
| 699 | pos = end + 1 |
| 700 | inv := content.len > 0 && content[0] == `^` |
| 701 | mut set := []rune{} |
| 702 | mut i := if inv { 1 } else { 0 } |
| 703 | for i < content.len { |
| 704 | c, l := read_rune_at(content.str, content.len, i) |
| 705 | if i + l + 1 < content.len && content[i + l] == `-` { |
| 706 | ec, el := read_rune_at(content.str, content.len, i + l + 1) |
| 707 | for r := c; r <= ec; r++ { |
| 708 | set << r |
| 709 | } |
| 710 | i += l + 1 + el |
| 711 | } else { |
| 712 | set << c |
| 713 | i += l |
| 714 | } |
| 715 | } |
| 716 | parsed_nodes << Node{ |
| 717 | typ: .char_class |
| 718 | char_set: set |
| 719 | inverted: inv |
| 720 | ignore_case: current_flags.ignore_case |
| 721 | } |
| 722 | } |
| 723 | `\\` { |
| 724 | pos += char_len |
| 725 | if pos >= pattern.len { |
| 726 | return error('Trailing backslash') |
| 727 | } |
| 728 | esc, el := read_rune_at(pattern.str, pattern.len, pos) |
| 729 | pos += el |
| 730 | match esc { |
| 731 | `w` { |
| 732 | parsed_nodes << Node{ |
| 733 | typ: .word_char |
| 734 | } |
| 735 | } |
| 736 | `W` { |
| 737 | parsed_nodes << Node{ |
| 738 | typ: .non_word_char |
| 739 | } |
| 740 | } |
| 741 | `d` { |
| 742 | parsed_nodes << Node{ |
| 743 | typ: .digit |
| 744 | } |
| 745 | } |
| 746 | `D` { |
| 747 | parsed_nodes << Node{ |
| 748 | typ: .non_digit |
| 749 | } |
| 750 | } |
| 751 | `s` { |
| 752 | parsed_nodes << Node{ |
| 753 | typ: .whitespace |
| 754 | } |
| 755 | } |
| 756 | `S` { |
| 757 | parsed_nodes << Node{ |
| 758 | typ: .non_whitespace |
| 759 | } |
| 760 | } |
| 761 | `b` { |
| 762 | parsed_nodes << Node{ |
| 763 | typ: .word_boundary |
| 764 | } |
| 765 | } |
| 766 | `B` { |
| 767 | parsed_nodes << Node{ |
| 768 | typ: .non_word_boundary |
| 769 | } |
| 770 | } |
| 771 | `a` { |
| 772 | parsed_nodes << Node{ |
| 773 | typ: .lowercase_char |
| 774 | } |
| 775 | } |
| 776 | `A` { |
| 777 | parsed_nodes << Node{ |
| 778 | typ: .uppercase_char |
| 779 | } |
| 780 | } |
| 781 | `x` { |
| 782 | // \xHH - two hex digits decode to a character |
| 783 | if pos + 2 > pattern.len { |
| 784 | return error('\\x requires exactly 2 hex digits') |
| 785 | } |
| 786 | hex_str := pattern[pos..pos + 2] |
| 787 | val := strconv.parse_uint(hex_str, 16, 32) or { |
| 788 | return error('Invalid hex escape \\x${hex_str}') |
| 789 | } |
| 790 | pos += 2 |
| 791 | parsed_nodes << Node{ |
| 792 | typ: .chr |
| 793 | chr: rune(val) |
| 794 | ignore_case: current_flags.ignore_case |
| 795 | } |
| 796 | } |
| 797 | `X` { |
| 798 | // \XHHHH - four hex digits decode to a Unicode codepoint |
| 799 | if pos + 4 > pattern.len { |
| 800 | return error('\\X requires exactly 4 hex digits') |
| 801 | } |
| 802 | hex_str := pattern[pos..pos + 4] |
| 803 | val := strconv.parse_uint(hex_str, 16, 32) or { |
| 804 | return error('Invalid hex escape \\X${hex_str}') |
| 805 | } |
| 806 | pos += 4 |
| 807 | parsed_nodes << Node{ |
| 808 | typ: .chr |
| 809 | chr: rune(val) |
| 810 | ignore_case: current_flags.ignore_case |
| 811 | } |
| 812 | } |
| 813 | else { |
| 814 | parsed_nodes << Node{ |
| 815 | typ: .chr |
| 816 | chr: esc |
| 817 | ignore_case: current_flags.ignore_case |
| 818 | } |
| 819 | } |
| 820 | } |
| 821 | } |
| 822 | else { |
| 823 | pos += char_len |
| 824 | parsed_nodes << Node{ |
| 825 | typ: .chr |
| 826 | chr: chr |
| 827 | ignore_case: current_flags.ignore_case |
| 828 | } |
| 829 | } |
| 830 | } |
| 831 | |
| 832 | if parsed_nodes.len > 0 { |
| 833 | mut q := Quantifier{1, 1, true} |
| 834 | if pos < pattern.len { |
| 835 | peek := pattern[pos] |
| 836 | match peek { |
| 837 | `*` { |
| 838 | q = Quantifier{0, -1, true} |
| 839 | pos++ |
| 840 | } |
| 841 | `+` { |
| 842 | q = Quantifier{1, -1, true} |
| 843 | pos++ |
| 844 | } |
| 845 | `?` { |
| 846 | q = Quantifier{0, 1, true} |
| 847 | pos++ |
| 848 | } |
| 849 | `{` { |
| 850 | end := pattern.index_after('}', pos) or { -1 } |
| 851 | if end == -1 { |
| 852 | return error('Unclosed quantifier') |
| 853 | } |
| 854 | parts := pattern[pos + 1..end].split(',') |
| 855 | min := parts[0].int() |
| 856 | max := if parts.len > 1 { |
| 857 | if parts[1] == '' { -1 } else { parts[1].int() } |
| 858 | } else { |
| 859 | min |
| 860 | } |
| 861 | if min < 0 || (max != -1 && max < min) { |
| 862 | return error('Invalid quantifier range {${min},${max}}') |
| 863 | } |
| 864 | q = Quantifier{min, max, true} |
| 865 | pos = end + 1 |
| 866 | } |
| 867 | else {} |
| 868 | } |
| 869 | |
| 870 | if pos < pattern.len && pattern[pos] == `?` { |
| 871 | q.greedy = false |
| 872 | pos++ |
| 873 | } |
| 874 | } |
| 875 | parsed_nodes.last().quant = q |
| 876 | } |
| 877 | current_sequence << parsed_nodes |
| 878 | } |
| 879 | |
| 880 | if alternatives.len > 0 { |
| 881 | if current_sequence.len == 0 { |
| 882 | return error('Empty alternative at end of pattern') |
| 883 | } |
| 884 | alternatives << current_sequence |
| 885 | return [ |
| 886 | Node{ |
| 887 | typ: .alternation |
| 888 | alternatives: alternatives |
| 889 | quant: Quantifier{1, 1, true} |
| 890 | }, |
| 891 | ], pos, group_counter |
| 892 | } |
| 893 | return current_sequence, pos, group_counter |
| 894 | } |
| 895 | |
| 896 | /****************************************************************************** |
| 897 | * |
| 898 | * Virtual Machine Execution Engine (Highly Optimized) |
| 899 | * |
| 900 | ******************************************************************************/ |
| 901 | |
| 902 | // vm_match executes the bytecode against the input string using the provided Machine state. |
| 903 | // OPTIMIZATION: Uses raw pointers for instruction and stack access to bypass bounds checking. |
| 904 | @[direct_array_access] |
| 905 | fn (r &Regex) vm_match(text string, start_pos int, mut m Machine) ?Match { |
| 906 | unsafe { |
| 907 | // Optimization: Cast voidptr to typed pointer for direct indexing |
| 908 | mut cap_ptr := &int(m.captures.data) |
| 909 | cap_len := m.captures.len |
| 910 | |
| 911 | // Fast clear of captures using pointer arithmetic (memset-like) |
| 912 | for i := 0; i < cap_len; i++ { |
| 913 | cap_ptr[i] = -1 |
| 914 | } |
| 915 | |
| 916 | mut sp := start_pos // String Pointer Index |
| 917 | mut stack_ptr := 0 // Stack Pointer Offset |
| 918 | |
| 919 | cap_size := r.total_groups * 2 |
| 920 | frame_size := cap_size + 2 // [captures..., saved_sp, saved_pc] |
| 921 | |
| 922 | // Raw pointers for hot path access |
| 923 | prog_start := &Inst(r.prog.data) |
| 924 | mut inst_ptr := prog_start // PC as a pointer |
| 925 | |
| 926 | str_ptr := text.str |
| 927 | str_len := text.len |
| 928 | |
| 929 | // Cache stack data pointer (cast to typed pointer) |
| 930 | mut stack_data := &int(m.stack.data) |
| 931 | mut stack_max := m.stack.len |
| 932 | |
| 933 | for { |
| 934 | // Check if we walked off the program (should be caught by match inst) |
| 935 | // Using pointer arithmetic: offset = (inst_ptr - prog_start) |
| 936 | |
| 937 | match inst_ptr.typ { |
| 938 | .match { |
| 939 | // Only allocate result strings on successful match |
| 940 | mut s_groups := []string{cap: r.total_groups} |
| 941 | for i := 0; i < r.total_groups; i++ { |
| 942 | s, e := cap_ptr[i * 2], cap_ptr[i * 2 + 1] |
| 943 | s_groups << if s != -1 && e >= s { text[s..e] } else { '' } |
| 944 | } |
| 945 | return Match{ |
| 946 | text: text[start_pos..sp] |
| 947 | start: start_pos |
| 948 | end: sp |
| 949 | groups: s_groups |
| 950 | } |
| 951 | } |
| 952 | .char { |
| 953 | if sp >= str_len { |
| 954 | goto backtrack |
| 955 | } |
| 956 | curr_byte := str_ptr[sp] |
| 957 | |
| 958 | // Fast ASCII path |
| 959 | if curr_byte < 128 && inst_ptr.val < 128 { |
| 960 | mut c1, mut c2 := curr_byte, u8(inst_ptr.val) |
| 961 | if inst_ptr.ignore_case { |
| 962 | if c1 >= `a` && c1 <= `z` { |
| 963 | c1 -= 32 |
| 964 | } |
| 965 | if c2 >= `a` && c2 <= `z` { |
| 966 | c2 -= 32 |
| 967 | } |
| 968 | } |
| 969 | if c1 == c2 { |
| 970 | sp++ |
| 971 | inst_ptr++ |
| 972 | continue |
| 973 | } |
| 974 | goto backtrack |
| 975 | } |
| 976 | |
| 977 | // UTF-8 Path |
| 978 | rn, l := read_rune_at(str_ptr, str_len, sp) |
| 979 | if l == 0 { |
| 980 | goto backtrack |
| 981 | } |
| 982 | |
| 983 | mut match_ok := false |
| 984 | if inst_ptr.ignore_case { |
| 985 | r1 := if rn >= `a` && rn <= `z` { rn - 32 } else { rn } |
| 986 | r2 := if inst_ptr.val >= `a` && inst_ptr.val <= `z` { |
| 987 | inst_ptr.val - 32 |
| 988 | } else { |
| 989 | inst_ptr.val |
| 990 | } |
| 991 | if r1 == r2 { |
| 992 | match_ok = true |
| 993 | } |
| 994 | } else if rn == inst_ptr.val { |
| 995 | match_ok = true |
| 996 | } |
| 997 | |
| 998 | if match_ok { |
| 999 | sp += l |
| 1000 | inst_ptr++ |
| 1001 | } else { |
| 1002 | goto backtrack |
| 1003 | } |
| 1004 | } |
| 1005 | .string { |
| 1006 | if sp + inst_ptr.val_len > str_len { |
| 1007 | goto backtrack |
| 1008 | } |
| 1009 | // Inline memcmp |
| 1010 | v_str := inst_ptr.val_str.str |
| 1011 | for i in 0 .. inst_ptr.val_len { |
| 1012 | if str_ptr[sp + i] != v_str[i] { |
| 1013 | goto backtrack |
| 1014 | } |
| 1015 | } |
| 1016 | sp += inst_ptr.val_len |
| 1017 | inst_ptr++ |
| 1018 | } |
| 1019 | .class { |
| 1020 | if sp >= str_len { |
| 1021 | goto backtrack |
| 1022 | } |
| 1023 | c_byte := str_ptr[sp] |
| 1024 | mut matched := false |
| 1025 | mut cl := 1 |
| 1026 | |
| 1027 | // Optimization: Bitmap lookup for ASCII |
| 1028 | if c_byte < 128 { |
| 1029 | if (inst_ptr.bitmap[c_byte >> 5] & (u32(1) << (c_byte & 31))) != 0 { |
| 1030 | matched = true |
| 1031 | } |
| 1032 | } else { |
| 1033 | rn, l := read_rune_at(str_ptr, str_len, sp) |
| 1034 | cl = l |
| 1035 | for cr in inst_ptr.char_class { |
| 1036 | if cr == rn { |
| 1037 | matched = true |
| 1038 | break |
| 1039 | } |
| 1040 | } |
| 1041 | } |
| 1042 | |
| 1043 | if matched != inst_ptr.inverted { |
| 1044 | sp += cl |
| 1045 | inst_ptr++ |
| 1046 | } else { |
| 1047 | goto backtrack |
| 1048 | } |
| 1049 | } |
| 1050 | .any { |
| 1051 | if sp >= str_len { |
| 1052 | goto backtrack |
| 1053 | } |
| 1054 | if inst_ptr.dot_all || str_ptr[sp] != `\n` { |
| 1055 | _, cl := read_rune_at(str_ptr, str_len, sp) |
| 1056 | sp += cl |
| 1057 | inst_ptr++ |
| 1058 | } else { |
| 1059 | goto backtrack |
| 1060 | } |
| 1061 | } |
| 1062 | .save { |
| 1063 | cap_ptr[inst_ptr.group_idx] = sp |
| 1064 | inst_ptr++ |
| 1065 | } |
| 1066 | .split { |
| 1067 | if stack_ptr + frame_size >= stack_max { |
| 1068 | new_size := stack_max * 2 |
| 1069 | if new_size > r.max_stack_depth { |
| 1070 | goto backtrack |
| 1071 | } |
| 1072 | m.stack.grow_len(new_size) |
| 1073 | stack_data = &int(m.stack.data) // Pointer might change on realloc |
| 1074 | stack_max = new_size |
| 1075 | } |
| 1076 | |
| 1077 | // Optimization: Unrolled stack push |
| 1078 | stack_offset := stack_ptr |
| 1079 | for i in 0 .. cap_size { |
| 1080 | stack_data[stack_offset + i] = cap_ptr[i] |
| 1081 | } |
| 1082 | stack_data[stack_offset + cap_size] = sp |
| 1083 | // Save backtrack target PC index |
| 1084 | stack_data[stack_offset + cap_size + 1] = inst_ptr.target_y |
| 1085 | |
| 1086 | stack_ptr += frame_size |
| 1087 | // Jump to primary target (convert index to pointer) |
| 1088 | // FIX: Use pointer indexing instead of addition to avoid type mismatch |
| 1089 | inst_ptr = &prog_start[inst_ptr.target_x] |
| 1090 | } |
| 1091 | .jmp { |
| 1092 | inst_ptr = &prog_start[inst_ptr.target_x] |
| 1093 | } |
| 1094 | .assert_start { |
| 1095 | if sp == 0 { |
| 1096 | inst_ptr++ |
| 1097 | } else { |
| 1098 | goto backtrack |
| 1099 | } |
| 1100 | } |
| 1101 | .assert_end { |
| 1102 | if sp == str_len { |
| 1103 | inst_ptr++ |
| 1104 | } else { |
| 1105 | goto backtrack |
| 1106 | } |
| 1107 | } |
| 1108 | .assert_line_start { |
| 1109 | if sp == 0 || (sp > 0 && str_ptr[sp - 1] == `\n`) { |
| 1110 | inst_ptr++ |
| 1111 | } else { |
| 1112 | goto backtrack |
| 1113 | } |
| 1114 | } |
| 1115 | .assert_line_end { |
| 1116 | if sp == str_len || str_ptr[sp] == `\n` { |
| 1117 | inst_ptr++ |
| 1118 | } else { |
| 1119 | goto backtrack |
| 1120 | } |
| 1121 | } |
| 1122 | .assert_bound, .assert_nbound { |
| 1123 | l := if sp > 0 { is_word_char(str_ptr[sp - 1]) } else { false } |
| 1124 | r_ := if sp < str_len { is_word_char(str_ptr[sp]) } else { false } |
| 1125 | if (inst_ptr.typ == .assert_bound && l != r_) |
| 1126 | || (inst_ptr.typ == .assert_nbound && l == r_) { |
| 1127 | inst_ptr++ |
| 1128 | } else { |
| 1129 | goto backtrack |
| 1130 | } |
| 1131 | } |
| 1132 | } |
| 1133 | |
| 1134 | continue |
| 1135 | |
| 1136 | backtrack: |
| 1137 | if stack_ptr <= 0 { |
| 1138 | return none |
| 1139 | } |
| 1140 | |
| 1141 | stack_ptr -= frame_size |
| 1142 | stack_offset := stack_ptr |
| 1143 | |
| 1144 | // Restore captures |
| 1145 | for i in 0 .. cap_size { |
| 1146 | cap_ptr[i] = stack_data[stack_offset + i] |
| 1147 | } |
| 1148 | sp = stack_data[stack_offset + cap_size] |
| 1149 | // Restore PC from index (explicit cast for pointer arithmetic) |
| 1150 | // FIX: Use pointer indexing instead of addition |
| 1151 | inst_ptr = &prog_start[stack_data[stack_offset + cap_size + 1]] |
| 1152 | } |
| 1153 | } |
| 1154 | return none |
| 1155 | } |
| 1156 | |
| 1157 | /****************************************************************************** |
| 1158 | * |
| 1159 | * Public API |
| 1160 | * |
| 1161 | ******************************************************************************/ |
| 1162 | |
| 1163 | // find returns the first match of the regex in text. |
| 1164 | pub fn (r &Regex) find(text string) ?Match { |
| 1165 | return r.find_from(text, 0) |
| 1166 | } |
| 1167 | |
| 1168 | // find_from returns the first match starting from start_index. |
| 1169 | // Optimized with fast prefix skipping and anchor checks. |
| 1170 | pub fn (r &Regex) find_from(text string, start_index int) ?Match { |
| 1171 | if start_index < 0 || start_index > text.len { |
| 1172 | return none |
| 1173 | } |
| 1174 | mut m := r.new_machine() |
| 1175 | |
| 1176 | // Optimization: Anchored pattern (^) only checks the start |
| 1177 | if r.anchored { |
| 1178 | if start_index == 0 { |
| 1179 | return r.vm_match(text, 0, mut m) |
| 1180 | } |
| 1181 | // If multiline mode is NOT enabled, ^ only matches index 0. |
| 1182 | // If multiline is enabled, we need to check every newline, handled by logic below. |
| 1183 | // Note: The compiler sets anchored=true only for ^ at start. |
| 1184 | // If multiline flag is set dynamically inside pattern, strict anchoring logic might differ, |
| 1185 | // but standard ^ usage benefits here. |
| 1186 | } |
| 1187 | |
| 1188 | // Optimization: Boyer-Moore-like literal prefix skip |
| 1189 | if r.has_prefix { |
| 1190 | mut i := text.index_after(r.prefix_lit, start_index) or { -1 } |
| 1191 | for i != -1 { |
| 1192 | if res := r.vm_match(text, i, mut m) { |
| 1193 | return res |
| 1194 | } |
| 1195 | i = text.index_after(r.prefix_lit, i + 1) or { -1 } |
| 1196 | } |
| 1197 | return none |
| 1198 | } |
| 1199 | |
| 1200 | for i := start_index; i <= text.len; i++ { |
| 1201 | // Skip UTF-8 continuation bytes to ensure we only match at rune boundaries |
| 1202 | // 0xC0 (11000000) is the start of a multi-byte sequence, 0x80 (10000000) is a continuation |
| 1203 | if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 { |
| 1204 | continue |
| 1205 | } |
| 1206 | if res := r.vm_match(text, i, mut m) { |
| 1207 | return res |
| 1208 | } |
| 1209 | } |
| 1210 | return none |
| 1211 | } |
| 1212 | |
| 1213 | // find_all returns all non-overlapping matches in text. |
| 1214 | pub fn (r &Regex) find_all(text string) []Match { |
| 1215 | mut matches := []Match{} |
| 1216 | mut m := r.new_machine() |
| 1217 | mut i := 0 |
| 1218 | for i <= text.len { |
| 1219 | if i > 0 && i < text.len && (text[i] & 0xC0) == 0x80 { |
| 1220 | i++ |
| 1221 | continue |
| 1222 | } |
| 1223 | if res := r.vm_match(text, i, mut m) { |
| 1224 | matches << res |
| 1225 | if res.end > i { |
| 1226 | i = res.end |
| 1227 | } else { |
| 1228 | // Empty match: advance by one full rune to avoid infinite loop |
| 1229 | _, rune_len := read_rune_at(text.str, text.len, i) |
| 1230 | i += if rune_len > 0 { rune_len } else { 1 } |
| 1231 | } |
| 1232 | } else { |
| 1233 | i++ |
| 1234 | } |
| 1235 | } |
| 1236 | return matches |
| 1237 | } |
| 1238 | |
| 1239 | // replace finds the first match and replaces it using repl. Supports $1, $2 backreferences. |
| 1240 | pub fn (r &Regex) replace(text string, repl string) string { |
| 1241 | res := r.find(text) or { return text } |
| 1242 | mut sb := strings.new_builder(text.len + repl.len) |
| 1243 | sb.write_string(text[0..res.start]) |
| 1244 | mut i := 0 |
| 1245 | for i < repl.len { |
| 1246 | if repl[i] == `$` && i + 1 < repl.len { |
| 1247 | next_char := repl[i + 1] |
| 1248 | if next_char >= `0` && next_char <= `9` { |
| 1249 | d := next_char - `0` - 1 |
| 1250 | if d >= 0 && d < res.groups.len { |
| 1251 | sb.write_string(res.groups[d]) |
| 1252 | } |
| 1253 | i += 2 |
| 1254 | continue |
| 1255 | } |
| 1256 | } |
| 1257 | sb.write_u8(repl[i]) |
| 1258 | i++ |
| 1259 | } |
| 1260 | sb.write_string(text[res.end..]) |
| 1261 | return sb.str() |
| 1262 | } |
| 1263 | |
| 1264 | // fullmatch returns a Match only if the pattern matches the entire text. |
| 1265 | pub fn (r &Regex) fullmatch(text string) ?Match { |
| 1266 | mut m := r.new_machine() |
| 1267 | if res := r.vm_match(text, 0, mut m) { |
| 1268 | if res.end == text.len { |
| 1269 | return res |
| 1270 | } |
| 1271 | } |
| 1272 | return none |
| 1273 | } |
| 1274 | |
| 1275 | // group_by_name retrieves text captured by a named group. |
| 1276 | pub fn (r &Regex) group_by_name(m Match, name string) string { |
| 1277 | idx := r.group_map[name] or { return '' } |
| 1278 | return if idx < m.groups.len { m.groups[idx] } else { '' } |
| 1279 | } |
| 1280 | |
| 1281 | /****************************************************************************** |
| 1282 | * |
| 1283 | * Compatibility Layer |
| 1284 | * |
| 1285 | ******************************************************************************/ |
| 1286 | |
| 1287 | // new_regex is a helper wrapper to compile a regex pattern. |
| 1288 | pub fn new_regex(pattern string, _ int) !Regex { |
| 1289 | return compile(pattern) |
| 1290 | } |
| 1291 | |
| 1292 | // match_str is a compatibility alias for find_from. |
| 1293 | pub fn (r &Regex) match_str(text string, start_index int, _ int) ?Match { |
| 1294 | return r.find_from(text, start_index) |
| 1295 | } |
| 1296 | |
| 1297 | // get returns the matched text for a specific group index. |
| 1298 | // Index 0 returns the full match, 1..n returns capture groups. |
| 1299 | pub fn (m Match) get(idx int) ?string { |
| 1300 | if idx == 0 { |
| 1301 | return m.text |
| 1302 | } |
| 1303 | if idx > 0 && idx <= m.groups.len { |
| 1304 | return m.groups[idx - 1] |
| 1305 | } |
| 1306 | return none |
| 1307 | } |
| 1308 | |
| 1309 | // get_all returns a list of all captured strings, starting with the full match at index 0. |
| 1310 | pub fn (m Match) get_all() []string { |
| 1311 | mut res := [m.text] |
| 1312 | res << m.groups |
| 1313 | return res |
| 1314 | } |
| 1315 | |