| 1 | /* |
| 2 | regex 1.0 alpha |
| 3 | |
| 4 | Copyright (c) 2019-2024 Dario Deledda. All rights reserved. |
| 5 | Use of this source code is governed by an MIT license |
| 6 | that can be found in the LICENSE file. |
| 7 | |
| 8 | This file contains regex module |
| 9 | |
| 10 | Know limitation: |
| 11 | - find is implemented in a trivial way |
| 12 | - not full compliant PCRE |
| 13 | - not compliant POSIX ERE |
| 14 | */ |
| 15 | module regex |
| 16 | |
| 17 | import strings |
| 18 | |
| 19 | pub const v_regex_version = '1.0 alpha' // regex module version |
| 20 | pub const max_code_len = 256 // default small base code len for the regex programs |
| 21 | pub const max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 |
| 22 | |
| 23 | // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode |
| 24 | pub const spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`] |
| 25 | // new line chars for now only '\n' |
| 26 | pub const new_line_list = [`\n`, `\r`] |
| 27 | |
| 28 | // Results |
| 29 | pub const no_match_found = -1 |
| 30 | |
| 31 | // Errors |
| 32 | pub const compile_ok = 0 // the regex string compiled, all ok |
| 33 | pub const err_char_unknown = -2 // the char used is unknow to the system |
| 34 | pub const err_undefined = -3 // the compiler symbol is undefined |
| 35 | pub const err_internal_error = -4 // Bug in the regex system!! |
| 36 | pub const err_cc_alloc_overflow = -5 // memory for char class full!! |
| 37 | pub const err_syntax_error = -6 // syntax error in regex compiling |
| 38 | pub const err_groups_overflow = -7 // max number of groups reached |
| 39 | pub const err_groups_max_nested = -8 // max number of nested group reached |
| 40 | pub const err_group_not_balanced = -9 // group not balanced |
| 41 | pub const err_group_qm_notation = -10 // group invalid notation |
| 42 | pub const err_invalid_or_with_cc = -11 // invalid or on two consecutive char class |
| 43 | pub const err_neg_group_quantifier = -12 // negation groups can not have quantifier |
| 44 | pub const err_consecutive_dots = -13 |
| 45 | |
| 46 | //************************************* |
| 47 | // regex program instructions |
| 48 | //************************************* |
| 49 | const ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char |
| 50 | |
| 51 | // char class 11 0100 AA xxxxxxxx |
| 52 | // AA = 00 regular class |
| 53 | // AA = 01 Negated class ^ char |
| 54 | const ist_char_class = u32(0xD1000000) // MASK |
| 55 | const ist_char_class_pos = u32(0xD0000000) // char class normal [abc] |
| 56 | const ist_char_class_neg = u32(0xD1000000) // char class negate [^abc] |
| 57 | |
| 58 | // dot char 10 0110 xx xxxxxxxx |
| 59 | const ist_dot_char = u32(0x98000000) // match any char except \n |
| 60 | |
| 61 | // backslash chars 10 0100 xx xxxxxxxx |
| 62 | const ist_bsls_char = u32(0x90000000) // backslash char |
| 63 | |
| 64 | // OR | 10 010Y xx xxxxxxxx |
| 65 | const ist_or_branch = u32(0x91000000) // OR case |
| 66 | |
| 67 | // groups 10 010Y xx xxxxxxxx |
| 68 | const ist_group_start = u32(0x92000000) // group start ( |
| 69 | const ist_group_end = u32(0x94000000) // group end ) |
| 70 | |
| 71 | // control instructions |
| 72 | const ist_prog_end = u32(0x88000000) |
| 73 | |
| 74 | /* |
| 75 | General Utilities |
| 76 | */ |
| 77 | // utf8util_char_len calculate the length in bytes of a utf8 char |
| 78 | @[inline] |
| 79 | fn utf8util_char_len(b u8) int { |
| 80 | return int(((u32(0xe5000000) >> ((b >> 3) & 0x1e)) & 3) + 1) |
| 81 | } |
| 82 | |
| 83 | // get_char get a char from position i and return an u32 with the unicode code |
| 84 | @[direct_array_access; inline] |
| 85 | fn (re &RE) get_char(in_txt string, i int) (u32, int) { |
| 86 | ini := unsafe { in_txt.str[i] } |
| 87 | // ascii 8 bit |
| 88 | if (re.flag & f_bin) != 0 || ini & 0x80 == 0 { |
| 89 | return u32(ini), 1 |
| 90 | } |
| 91 | // unicode char |
| 92 | char_len := utf8util_char_len(ini) |
| 93 | mut tmp := 0 |
| 94 | mut ch := u32(0) |
| 95 | for tmp < char_len { |
| 96 | ch = (ch << 8) | unsafe { in_txt.str[i + tmp] } |
| 97 | tmp++ |
| 98 | } |
| 99 | return ch, char_len |
| 100 | } |
| 101 | |
| 102 | // get_charb get a char from position i and return an u32 with the unicode code |
| 103 | @[direct_array_access; inline] |
| 104 | fn (re &RE) get_charb(in_txt &u8, i int) (u32, int) { |
| 105 | // ascii 8 bit |
| 106 | if (re.flag & f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 { |
| 107 | return u32(unsafe { in_txt[i] }), 1 |
| 108 | } |
| 109 | // unicode char |
| 110 | char_len := utf8util_char_len(unsafe { in_txt[i] }) |
| 111 | mut tmp := 0 |
| 112 | mut ch := u32(0) |
| 113 | for tmp < char_len { |
| 114 | ch = (ch << 8) | unsafe { in_txt[i + tmp] } |
| 115 | tmp++ |
| 116 | } |
| 117 | return ch, char_len |
| 118 | } |
| 119 | |
| 120 | @[inline] |
| 121 | fn is_alnum(in_char u8) bool { |
| 122 | mut tmp := in_char - `A` |
| 123 | if tmp <= 25 { |
| 124 | return true |
| 125 | } |
| 126 | tmp = in_char - `a` |
| 127 | if tmp <= 25 { |
| 128 | return true |
| 129 | } |
| 130 | tmp = in_char - `0` |
| 131 | if tmp <= 9 { |
| 132 | return true |
| 133 | } |
| 134 | if in_char == `_` { |
| 135 | return true |
| 136 | } |
| 137 | return false |
| 138 | } |
| 139 | |
| 140 | @[inline] |
| 141 | fn is_not_alnum(in_char u8) bool { |
| 142 | return !is_alnum(in_char) |
| 143 | } |
| 144 | |
| 145 | @[inline] |
| 146 | fn is_space(in_char u8) bool { |
| 147 | return in_char in spaces |
| 148 | } |
| 149 | |
| 150 | @[inline] |
| 151 | fn is_not_space(in_char u8) bool { |
| 152 | return !is_space(in_char) |
| 153 | } |
| 154 | |
| 155 | @[inline] |
| 156 | fn is_digit(in_char u8) bool { |
| 157 | tmp := in_char - `0` |
| 158 | return tmp <= 0x09 |
| 159 | } |
| 160 | |
| 161 | @[inline] |
| 162 | fn is_not_digit(in_char u8) bool { |
| 163 | return !is_digit(in_char) |
| 164 | } |
| 165 | |
| 166 | /* |
| 167 | @[inline] |
| 168 | fn is_wordchar(in_char byte) bool { |
| 169 | return is_alnum(in_char) || in_char == `_` |
| 170 | } |
| 171 | |
| 172 | @[inline] |
| 173 | fn is_not_wordchar(in_char byte) bool { |
| 174 | return !is_alnum(in_char) |
| 175 | } |
| 176 | */ |
| 177 | |
| 178 | @[inline] |
| 179 | fn is_lower(in_char u8) bool { |
| 180 | tmp := in_char - `a` |
| 181 | return tmp <= 25 |
| 182 | } |
| 183 | |
| 184 | @[inline] |
| 185 | fn is_upper(in_char u8) bool { |
| 186 | tmp := in_char - `A` |
| 187 | return tmp <= 25 |
| 188 | } |
| 189 | |
| 190 | pub fn (re &RE) get_parse_error_string(err int) string { |
| 191 | match err { |
| 192 | compile_ok { return 'compile_ok' } |
| 193 | no_match_found { return 'no_match_found' } |
| 194 | err_char_unknown { return 'err_char_unknown' } |
| 195 | err_undefined { return 'err_undefined' } |
| 196 | err_internal_error { return 'err_internal_error' } |
| 197 | err_cc_alloc_overflow { return 'err_cc_alloc_overflow' } |
| 198 | err_syntax_error { return 'err_syntax_error' } |
| 199 | err_groups_overflow { return 'err_groups_overflow' } |
| 200 | err_groups_max_nested { return 'err_groups_max_nested' } |
| 201 | err_group_not_balanced { return 'err_group_not_balanced' } |
| 202 | err_group_qm_notation { return 'err_group_qm_notation' } |
| 203 | err_invalid_or_with_cc { return 'err_invalid_or_with_cc' } |
| 204 | err_neg_group_quantifier { return 'err_neg_group_quantifier' } |
| 205 | err_consecutive_dots { return 'err_consecutive_dots' } |
| 206 | else { return 'err_unknown' } |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | // utf8_str convert and utf8 sequence to a printable string |
| 211 | @[inline] |
| 212 | fn utf8_str(ch rune) string { |
| 213 | mut i := 4 |
| 214 | mut res := '' |
| 215 | for i > 0 { |
| 216 | v := u8((ch >> ((i - 1) * 8)) & 0xFF) |
| 217 | if v != 0 { |
| 218 | res += '${v:1c}' |
| 219 | } |
| 220 | i-- |
| 221 | } |
| 222 | return res |
| 223 | } |
| 224 | |
| 225 | // simple_log default log function |
| 226 | fn simple_log(txt string) { |
| 227 | print(txt) |
| 228 | } |
| 229 | |
| 230 | /****************************************************************************** |
| 231 | * |
| 232 | * Token Structs |
| 233 | * |
| 234 | ******************************************************************************/ |
| 235 | pub type FnValidator = fn (u8) bool |
| 236 | |
| 237 | struct Token { |
| 238 | mut: |
| 239 | ist u32 |
| 240 | // char |
| 241 | ch rune // char of the token if any |
| 242 | ch_len u8 // char len |
| 243 | flag u8 // flag for general usage |
| 244 | // Quantifiers / branch |
| 245 | rep_min int // used also for jump next in the OR branch [no match] pc jump |
| 246 | rep_max int // used also for jump next in the OR branch [ match] pc jump |
| 247 | greedy bool // greedy quantifier flag |
| 248 | // Char class |
| 249 | cc_index int = -1 |
| 250 | // counters for quantifier check (repetitions) |
| 251 | rep int |
| 252 | // validator function pointer |
| 253 | validator FnValidator = unsafe { nil } |
| 254 | // groups variables |
| 255 | group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion |
| 256 | group_rep int // repetition of the group |
| 257 | group_id int = -1 // id of the group |
| 258 | goto_pc int = -1 // jump to this PC if is needed |
| 259 | // OR flag for the token |
| 260 | next_is_or bool // true if the next token is an OR |
| 261 | // dot_char token variables |
| 262 | dot_check_pc int = -1 // pc of the next token to check for dots |
| 263 | bsls_check_pc int = -1 // pc of the next token to check for bsls |
| 264 | cc_check_pc int = -1 // pc of the next token to check for CC |
| 265 | last_dot_flag bool // if true indicate that is the last dot_char in the regex |
| 266 | // debug fields |
| 267 | source_index int |
| 268 | } |
| 269 | |
| 270 | @[inline] |
| 271 | fn (mut tok Token) reset() { |
| 272 | tok.rep = 0 |
| 273 | } |
| 274 | |
| 275 | /****************************************************************************** |
| 276 | * |
| 277 | * Regex struct |
| 278 | * |
| 279 | ******************************************************************************/ |
| 280 | pub const f_nl = 0x00000001 // end the match when find a new line symbol |
| 281 | pub const f_ms = 0x00000002 // match true only if the match is at the start of the string |
| 282 | pub const f_me = 0x00000004 // match true only if the match is at the end of the string |
| 283 | pub const f_efm = 0x00000100 // exit on first token matched, used by search |
| 284 | pub const f_bin = 0x00000200 // work only on bytes, ignore utf-8 |
| 285 | pub const f_ci = 0x00000400 // compare ASCII letters without case sensitivity |
| 286 | |
| 287 | // behaviour modifier flags |
| 288 | pub const f_src = 0x00020000 |
| 289 | |
| 290 | // Log function prototype |
| 291 | pub type FnLog = fn (string) |
| 292 | |
| 293 | pub struct RE { |
| 294 | pub mut: |
| 295 | prog []Token |
| 296 | prog_len int // regex program len |
| 297 | // char classes storage |
| 298 | cc []CharClass // char class list |
| 299 | cc_index int // index |
| 300 | // groups |
| 301 | group_count int // number of groups in this regex struct |
| 302 | groups []int // groups index results |
| 303 | group_max_nested int = 3 // max nested group |
| 304 | group_max int = 8 // max allowed number of different groups |
| 305 | |
| 306 | state_list []StateObj |
| 307 | |
| 308 | group_csave_flag bool // flag to enable continuous saving |
| 309 | group_csave []int //= []int{} // groups continuous save list |
| 310 | |
| 311 | group_map map[string]int // groups names map |
| 312 | |
| 313 | group_stack []int |
| 314 | group_data []int |
| 315 | // flags |
| 316 | flag int // flag for optional parameters |
| 317 | // Debug/log |
| 318 | debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE |
| 319 | log_func FnLog = simple_log // log function, can be customized by the user |
| 320 | query string // query string |
| 321 | } |
| 322 | |
| 323 | // Reset RE object |
| 324 | @[direct_array_access; inline] |
| 325 | pub fn (mut re RE) reset() { |
| 326 | re.cc_index = 0 |
| 327 | |
| 328 | mut i := 0 |
| 329 | for i < re.prog_len { |
| 330 | re.prog[i].group_rep = 0 // clear repetition of the group |
| 331 | re.prog[i].rep = 0 // clear repetition of the token |
| 332 | i++ |
| 333 | } |
| 334 | |
| 335 | // init groups array |
| 336 | if re.group_count > 0 { |
| 337 | if re.groups.len == 0 { |
| 338 | // first run alloc memory |
| 339 | re.groups = []int{len: re.group_count * 2, init: -1} |
| 340 | } else { |
| 341 | // subsequent executions, only clean up the memory |
| 342 | i = 0 |
| 343 | for i < re.groups.len { |
| 344 | re.groups[i] = -1 |
| 345 | i++ |
| 346 | } |
| 347 | } |
| 348 | } |
| 349 | |
| 350 | // reset group_csave |
| 351 | if re.group_csave_flag == true { |
| 352 | re.group_csave.clear() // = []int{} |
| 353 | } |
| 354 | |
| 355 | // reset state list |
| 356 | re.state_list.clear() |
| 357 | // restore initial state of the stack |
| 358 | for mut x in re.group_stack { |
| 359 | x = -1 |
| 360 | } |
| 361 | } |
| 362 | |
| 363 | // reset for search mode fail |
| 364 | // gcc bug, dont use @[inline] or go 5 time slower |
| 365 | //[inline] |
| 366 | @[direct_array_access] |
| 367 | fn (mut re RE) reset_src() { |
| 368 | mut i := 0 |
| 369 | for i < re.prog_len { |
| 370 | re.prog[i].group_rep = 0 // clear repetition of the group |
| 371 | re.prog[i].rep = 0 // clear repetition of the token |
| 372 | i++ |
| 373 | } |
| 374 | } |
| 375 | |
| 376 | /****************************************************************************** |
| 377 | * |
| 378 | * Backslashes chars |
| 379 | * |
| 380 | ******************************************************************************/ |
| 381 | struct BslsStruct { |
| 382 | ch rune // meta char |
| 383 | validator FnValidator = unsafe { nil } // validator function pointer |
| 384 | } |
| 385 | |
| 386 | const bsls_validator_array = [ |
| 387 | BslsStruct{`w`, is_alnum}, |
| 388 | BslsStruct{`W`, is_not_alnum}, |
| 389 | BslsStruct{`s`, is_space}, |
| 390 | BslsStruct{`S`, is_not_space}, |
| 391 | BslsStruct{`d`, is_digit}, |
| 392 | BslsStruct{`D`, is_not_digit}, |
| 393 | BslsStruct{`a`, is_lower}, |
| 394 | BslsStruct{`A`, is_upper}, |
| 395 | ] |
| 396 | |
| 397 | // these chars are escape if preceded by a \ |
| 398 | const bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`, |
| 399 | `^`, `!`] |
| 400 | |
| 401 | enum BSLS_parse_state { |
| 402 | start |
| 403 | bsls_found |
| 404 | bsls_char |
| 405 | normal_char |
| 406 | hex_char |
| 407 | } |
| 408 | |
| 409 | // parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present |
| 410 | fn (re &RE) parse_bsls(in_txt string, in_i int) (int, int, u32) { |
| 411 | mut status := BSLS_parse_state.start |
| 412 | mut i := in_i |
| 413 | mut hex_max_len := 2 |
| 414 | mut hex_res := u32(0) |
| 415 | mut hex_count := 0 |
| 416 | |
| 417 | for i < in_txt.len { |
| 418 | // get our char |
| 419 | char_tmp, char_len := re.get_char(in_txt, i) |
| 420 | ch := u8(char_tmp) |
| 421 | // println("ch [${ch:c}]") |
| 422 | |
| 423 | if status == .start && ch == `\\` { |
| 424 | status = .bsls_found |
| 425 | i += char_len |
| 426 | continue |
| 427 | } |
| 428 | |
| 429 | // check if is our bsls char, for now only one length sequence |
| 430 | if status == .bsls_found { |
| 431 | for c, x in bsls_validator_array { |
| 432 | if x.ch == ch { |
| 433 | return c, i - in_i + 1, hex_res |
| 434 | } |
| 435 | } |
| 436 | |
| 437 | // check for \x00 hex 8bit |
| 438 | if ch == `x` { |
| 439 | status = .hex_char |
| 440 | hex_max_len = 2 |
| 441 | i += char_len |
| 442 | continue |
| 443 | } |
| 444 | |
| 445 | // check for \x00 hex 16bit |
| 446 | if ch == `X` { |
| 447 | status = .hex_char |
| 448 | hex_max_len = 4 |
| 449 | i += char_len |
| 450 | continue |
| 451 | } |
| 452 | |
| 453 | status = .normal_char |
| 454 | continue |
| 455 | } |
| 456 | |
| 457 | // manage hex byte |
| 458 | if status == .hex_char { |
| 459 | if ch >= `0` && ch <= `9` { |
| 460 | hex_count++ |
| 461 | hex_res <<= 4 |
| 462 | hex_res += u32(ch - `0`) |
| 463 | i += char_len |
| 464 | } else if ch >= `A` && ch <= `F` { |
| 465 | hex_count++ |
| 466 | hex_res <<= 4 |
| 467 | hex_res += u32(ch - `A` + 10) |
| 468 | i += char_len |
| 469 | } else if ch >= `a` && ch <= `f` { |
| 470 | hex_count++ |
| 471 | hex_res <<= 4 |
| 472 | hex_res += u32(ch - `a` + 10) |
| 473 | i += char_len |
| 474 | } else { |
| 475 | return err_syntax_error, i, hex_res |
| 476 | } |
| 477 | |
| 478 | // println("hex_res: ${hex_res:08x} hex_count: ${hex_count}") |
| 479 | |
| 480 | // look for more hex digits |
| 481 | if hex_count < hex_max_len { |
| 482 | continue |
| 483 | } |
| 484 | |
| 485 | // if over 8 nibble is more than 32 bit, error |
| 486 | if hex_count > hex_max_len { |
| 487 | return err_syntax_error, i - in_i, hex_res |
| 488 | } |
| 489 | |
| 490 | if hex_count == hex_max_len { |
| 491 | // we have a good result |
| 492 | // println("RESULT hex_res: ${hex_res:08x} hex_count: ${hex_count}") |
| 493 | return -2, i - in_i, hex_res |
| 494 | } |
| 495 | |
| 496 | // MUST NOT BE HERE! |
| 497 | return err_syntax_error, i, hex_res |
| 498 | } |
| 499 | |
| 500 | // no BSLS validator, manage as normal escape char char |
| 501 | if status == .normal_char { |
| 502 | if ch in bsls_escape_list { |
| 503 | return no_match_found, i - in_i + 1, hex_res |
| 504 | } |
| 505 | return err_syntax_error, i - in_i + 1, hex_res |
| 506 | } |
| 507 | |
| 508 | // at the present time we manage only one char after the \ |
| 509 | break |
| 510 | } |
| 511 | // not our bsls return KO |
| 512 | return err_syntax_error, i, hex_res |
| 513 | } |
| 514 | |
| 515 | /****************************************************************************** |
| 516 | * |
| 517 | * Char class |
| 518 | * |
| 519 | ******************************************************************************/ |
| 520 | const cc_null = 0 // empty cc token |
| 521 | |
| 522 | const cc_char = 1 // simple char: a |
| 523 | |
| 524 | const cc_int = 2 // char interval: a-z |
| 525 | |
| 526 | const cc_bsls = 3 // backslash char |
| 527 | |
| 528 | const cc_end = 4 |
| 529 | |
| 530 | struct CharClass { |
| 531 | mut: |
| 532 | cc_type int // type of cc token |
| 533 | ch0 rune // first char of the interval a-b a in this case |
| 534 | ch1 rune // second char of the interval a-b b in this case |
| 535 | validator FnValidator = unsafe { nil } // validator function pointer |
| 536 | } |
| 537 | |
| 538 | enum CharClass_parse_state { |
| 539 | start |
| 540 | in_char |
| 541 | in_bsls |
| 542 | separator |
| 543 | finish |
| 544 | } |
| 545 | |
| 546 | fn (re &RE) get_char_class(pc int) string { |
| 547 | buf := []u8{len: (re.cc.len)} |
| 548 | mut buf_ptr := unsafe { &u8(&buf) } |
| 549 | |
| 550 | mut cc_i := re.prog[pc].cc_index |
| 551 | mut i := 0 |
| 552 | mut tmp := 0 |
| 553 | for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end { |
| 554 | if re.cc[cc_i].cc_type == cc_bsls { |
| 555 | unsafe { |
| 556 | buf_ptr[i] = `\\` |
| 557 | i++ |
| 558 | buf_ptr[i] = u8(re.cc[cc_i].ch0) |
| 559 | i++ |
| 560 | } |
| 561 | } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 { |
| 562 | tmp = 3 |
| 563 | for tmp >= 0 { |
| 564 | x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) |
| 565 | if x != 0 { |
| 566 | unsafe { |
| 567 | buf_ptr[i] = x |
| 568 | i++ |
| 569 | } |
| 570 | } |
| 571 | tmp-- |
| 572 | } |
| 573 | } else { |
| 574 | tmp = 3 |
| 575 | for tmp >= 0 { |
| 576 | x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) |
| 577 | if x != 0 { |
| 578 | unsafe { |
| 579 | buf_ptr[i] = x |
| 580 | i++ |
| 581 | } |
| 582 | } |
| 583 | tmp-- |
| 584 | } |
| 585 | unsafe { |
| 586 | buf_ptr[i] = `-` |
| 587 | i++ |
| 588 | } |
| 589 | tmp = 3 |
| 590 | for tmp >= 0 { |
| 591 | x := u8((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF) |
| 592 | if x != 0 { |
| 593 | unsafe { |
| 594 | buf_ptr[i] = x |
| 595 | i++ |
| 596 | } |
| 597 | } |
| 598 | tmp-- |
| 599 | } |
| 600 | } |
| 601 | cc_i++ |
| 602 | } |
| 603 | unsafe { |
| 604 | buf_ptr[i] = u8(0) |
| 605 | } |
| 606 | return unsafe { tos_clone(buf_ptr) } |
| 607 | } |
| 608 | |
| 609 | fn (re &RE) check_char_class(pc int, ch rune) bool { |
| 610 | mut cc_i := re.prog[pc].cc_index |
| 611 | for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end { |
| 612 | if re.cc[cc_i].cc_type == cc_bsls { |
| 613 | if re.validator_matches(re.cc[cc_i].validator, ch) { |
| 614 | return true |
| 615 | } |
| 616 | } else if re.char_in_range(ch, re.cc[cc_i].ch0, re.cc[cc_i].ch1) { |
| 617 | return true |
| 618 | } |
| 619 | cc_i++ |
| 620 | } |
| 621 | return false |
| 622 | } |
| 623 | |
| 624 | @[inline] |
| 625 | fn (re &RE) chars_equal(a rune, b rune) bool { |
| 626 | if a == b { |
| 627 | return true |
| 628 | } |
| 629 | if (re.flag & f_ci) == 0 { |
| 630 | return false |
| 631 | } |
| 632 | lower_a := re.case_transform_char(a, false) |
| 633 | lower_b := re.case_transform_char(b, false) |
| 634 | if lower_a == lower_b { |
| 635 | return true |
| 636 | } |
| 637 | return re.case_transform_char(a, true) == re.case_transform_char(b, true) |
| 638 | } |
| 639 | |
| 640 | @[inline] |
| 641 | fn (re &RE) char_in_range(ch rune, start rune, end rune) bool { |
| 642 | if ch >= start && ch <= end { |
| 643 | return true |
| 644 | } |
| 645 | if (re.flag & f_ci) == 0 { |
| 646 | return false |
| 647 | } |
| 648 | lower := re.case_transform_char(ch, false) |
| 649 | if lower != ch && lower >= start && lower <= end { |
| 650 | return true |
| 651 | } |
| 652 | upper := re.case_transform_char(ch, true) |
| 653 | return upper != ch && upper >= start && upper <= end |
| 654 | } |
| 655 | |
| 656 | @[inline] |
| 657 | fn (re &RE) validator_matches(validator FnValidator, ch rune) bool { |
| 658 | if validator(u8(ch)) { |
| 659 | return true |
| 660 | } |
| 661 | if (re.flag & f_ci) == 0 { |
| 662 | return false |
| 663 | } |
| 664 | lower := re.case_transform_char(ch, false) |
| 665 | if lower != ch && validator(u8(lower)) { |
| 666 | return true |
| 667 | } |
| 668 | upper := re.case_transform_char(ch, true) |
| 669 | return upper != ch && validator(u8(upper)) |
| 670 | } |
| 671 | |
| 672 | fn (re &RE) case_transform_char(ch rune, upper bool) rune { |
| 673 | if upper { |
| 674 | if ch >= `a` && ch <= `z` { |
| 675 | return ch - 32 |
| 676 | } |
| 677 | return ch |
| 678 | } |
| 679 | if ch >= `A` && ch <= `Z` { |
| 680 | return ch + 32 |
| 681 | } |
| 682 | return ch |
| 683 | } |
| 684 | |
| 685 | @[inline] |
| 686 | fn (re &RE) token_matches(pc int, ch rune) bool { |
| 687 | return match re.prog[pc].ist { |
| 688 | ist_simple_char { re.chars_equal(re.prog[pc].ch, ch) } |
| 689 | ist_char_class_pos { re.check_char_class(pc, ch) } |
| 690 | ist_char_class_neg { !re.check_char_class(pc, ch) } |
| 691 | ist_bsls_char { re.validator_matches(re.prog[pc].validator, ch) } |
| 692 | else { false } |
| 693 | } |
| 694 | } |
| 695 | |
| 696 | // parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char |
| 697 | fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, u32) { |
| 698 | mut status := CharClass_parse_state.start |
| 699 | mut i := in_i |
| 700 | |
| 701 | mut tmp_index := re.cc_index |
| 702 | res_index := re.cc_index |
| 703 | |
| 704 | mut cc_type := u32(ist_char_class_pos) |
| 705 | |
| 706 | for i < in_txt.len { |
| 707 | // check if we are out of memory for char classes |
| 708 | if tmp_index >= re.cc.len { |
| 709 | return err_cc_alloc_overflow, 0, u32(0) |
| 710 | } |
| 711 | |
| 712 | // get our char |
| 713 | char_tmp, char_len := re.get_char(in_txt, i) |
| 714 | ch := u8(char_tmp) |
| 715 | |
| 716 | // println("CC #${i:3d} ch: ${ch:c}") |
| 717 | |
| 718 | // negation |
| 719 | if status == .start && ch == `^` { |
| 720 | cc_type = u32(ist_char_class_neg) |
| 721 | i += char_len |
| 722 | continue |
| 723 | } |
| 724 | |
| 725 | // minus symbol |
| 726 | if status == .start && ch == `-` { |
| 727 | re.cc[tmp_index].cc_type = cc_char |
| 728 | re.cc[tmp_index].ch0 = char_tmp |
| 729 | re.cc[tmp_index].ch1 = char_tmp |
| 730 | i += char_len |
| 731 | tmp_index++ |
| 732 | continue |
| 733 | } |
| 734 | |
| 735 | // bsls |
| 736 | if (status == .start || status == .in_char) && ch == `\\` { |
| 737 | // println("CC bsls.") |
| 738 | status = .in_bsls |
| 739 | i += char_len |
| 740 | continue |
| 741 | } |
| 742 | |
| 743 | if status == .in_bsls { |
| 744 | // println("CC bsls validation.") |
| 745 | for c, x in bsls_validator_array { |
| 746 | if x.ch == ch { |
| 747 | // println("CC bsls found [${ch:c}]") |
| 748 | re.cc[tmp_index].cc_type = cc_bsls |
| 749 | re.cc[tmp_index].ch0 = bsls_validator_array[c].ch |
| 750 | re.cc[tmp_index].ch1 = bsls_validator_array[c].ch |
| 751 | re.cc[tmp_index].validator = bsls_validator_array[c].validator |
| 752 | i += char_len |
| 753 | tmp_index++ |
| 754 | status = .in_char |
| 755 | break |
| 756 | } |
| 757 | } |
| 758 | if status == .in_bsls { |
| 759 | // manage as a simple char |
| 760 | // println("CC bsls not found [${ch:c}]") |
| 761 | re.cc[tmp_index].cc_type = cc_char |
| 762 | re.cc[tmp_index].ch0 = char_tmp |
| 763 | re.cc[tmp_index].ch1 = char_tmp |
| 764 | i += char_len |
| 765 | tmp_index++ |
| 766 | status = .in_char |
| 767 | continue |
| 768 | } else { |
| 769 | continue |
| 770 | } |
| 771 | } |
| 772 | |
| 773 | // simple char |
| 774 | if (status == .start || status == .in_char) && ch != `-` && ch != `]` { |
| 775 | status = .in_char |
| 776 | |
| 777 | re.cc[tmp_index].cc_type = cc_char |
| 778 | re.cc[tmp_index].ch0 = char_tmp |
| 779 | re.cc[tmp_index].ch1 = char_tmp |
| 780 | |
| 781 | i += char_len |
| 782 | tmp_index++ |
| 783 | continue |
| 784 | } |
| 785 | |
| 786 | // check range separator |
| 787 | if status == .in_char && ch == `-` { |
| 788 | status = .separator |
| 789 | i += char_len |
| 790 | continue |
| 791 | } |
| 792 | |
| 793 | // check range end |
| 794 | if status == .separator && ch != `]` && ch != `-` { |
| 795 | status = .in_char |
| 796 | re.cc[tmp_index - 1].cc_type = cc_int |
| 797 | re.cc[tmp_index - 1].ch1 = char_tmp |
| 798 | i += char_len |
| 799 | continue |
| 800 | } |
| 801 | |
| 802 | // char class end |
| 803 | if status == .in_char && ch == `]` { |
| 804 | re.cc[tmp_index].cc_type = cc_end |
| 805 | re.cc[tmp_index].ch0 = 0 |
| 806 | re.cc[tmp_index].ch1 = 0 |
| 807 | re.cc_index = tmp_index + 1 |
| 808 | |
| 809 | return res_index, i - in_i + 2, cc_type |
| 810 | } |
| 811 | |
| 812 | i++ |
| 813 | } |
| 814 | return err_syntax_error, 0, u32(0) |
| 815 | } |
| 816 | |
| 817 | /****************************************************************************** |
| 818 | * |
| 819 | * Re Compiler |
| 820 | * |
| 821 | ******************************************************************************/ |
| 822 | // |
| 823 | // Quantifier |
| 824 | // |
| 825 | enum Quant_parse_state { |
| 826 | start |
| 827 | min_parse |
| 828 | comma_checked |
| 829 | max_parse |
| 830 | greedy |
| 831 | gredy_parse |
| 832 | finish |
| 833 | } |
| 834 | |
| 835 | // parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char |
| 836 | fn (re &RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { |
| 837 | mut status := Quant_parse_state.start |
| 838 | mut i := in_i |
| 839 | |
| 840 | mut q_min := 0 // default min in a {} quantifier is 1 |
| 841 | mut q_max := 0 // default max in a {} quantifier is max_quantifier |
| 842 | |
| 843 | mut ch := u8(0) |
| 844 | |
| 845 | for i < in_txt.len { |
| 846 | unsafe { |
| 847 | ch = in_txt.str[i] |
| 848 | } |
| 849 | // println("${ch:c} status: ${status}") |
| 850 | |
| 851 | // exit on no compatible char with {} quantifier |
| 852 | if utf8util_char_len(ch) != 1 { |
| 853 | return err_syntax_error, i, 0, false |
| 854 | } |
| 855 | |
| 856 | // min parsing skip if comma present |
| 857 | if status == .start && ch == `,` { |
| 858 | q_min = 0 // default min in a {} quantifier is 0 |
| 859 | status = .comma_checked |
| 860 | i++ |
| 861 | continue |
| 862 | } |
| 863 | |
| 864 | if status == .start && is_digit(ch) { |
| 865 | status = .min_parse |
| 866 | q_min *= 10 |
| 867 | q_min += int(ch - `0`) |
| 868 | i++ |
| 869 | continue |
| 870 | } |
| 871 | |
| 872 | if status == .min_parse && is_digit(ch) { |
| 873 | q_min *= 10 |
| 874 | q_min += int(ch - `0`) |
| 875 | i++ |
| 876 | continue |
| 877 | } |
| 878 | |
| 879 | // we have parsed the min, now check the max |
| 880 | if status == .min_parse && ch == `,` { |
| 881 | status = .comma_checked |
| 882 | i++ |
| 883 | continue |
| 884 | } |
| 885 | |
| 886 | // single value {4} |
| 887 | if status == .min_parse && ch == `}` { |
| 888 | q_max = q_min |
| 889 | status = .greedy |
| 890 | continue |
| 891 | } |
| 892 | |
| 893 | // end without max |
| 894 | if status == .comma_checked && ch == `}` { |
| 895 | q_max = max_quantifier |
| 896 | status = .greedy |
| 897 | continue |
| 898 | } |
| 899 | |
| 900 | // start max parsing |
| 901 | if status == .comma_checked && is_digit(ch) { |
| 902 | status = .max_parse |
| 903 | q_max *= 10 |
| 904 | q_max += int(ch - `0`) |
| 905 | i++ |
| 906 | continue |
| 907 | } |
| 908 | |
| 909 | // parse the max |
| 910 | if status == .max_parse && is_digit(ch) { |
| 911 | q_max *= 10 |
| 912 | q_max += int(ch - `0`) |
| 913 | i++ |
| 914 | continue |
| 915 | } |
| 916 | |
| 917 | // finished the quantifier |
| 918 | if status == .max_parse && ch == `}` { |
| 919 | status = .greedy |
| 920 | continue |
| 921 | } |
| 922 | |
| 923 | // check if greedy flag char ? is present |
| 924 | if status == .greedy { |
| 925 | if i + 1 < in_txt.len { |
| 926 | i++ |
| 927 | status = .gredy_parse |
| 928 | continue |
| 929 | } |
| 930 | return q_min, q_max, i - in_i + 2, false |
| 931 | } |
| 932 | |
| 933 | // check the greedy flag |
| 934 | if status == .gredy_parse { |
| 935 | if ch == `?` { |
| 936 | return q_min, q_max, i - in_i + 2, true |
| 937 | } else { |
| 938 | i-- |
| 939 | return q_min, q_max, i - in_i + 2, false |
| 940 | } |
| 941 | } |
| 942 | |
| 943 | // not a {} quantifier, exit |
| 944 | return err_syntax_error, i, 0, false |
| 945 | } |
| 946 | |
| 947 | // not a conform {} quantifier |
| 948 | return err_syntax_error, i, 0, false |
| 949 | } |
| 950 | |
| 951 | // |
| 952 | // Groups |
| 953 | // |
| 954 | enum Group_parse_state { |
| 955 | start |
| 956 | q_mark // (? |
| 957 | q_mark1 // (?:|P checking |
| 958 | p_status // (?P |
| 959 | p_start // (?P< |
| 960 | p_end // (?P<...> |
| 961 | p_in_name // (?P<... |
| 962 | finish |
| 963 | } |
| 964 | |
| 965 | // parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index) |
| 966 | fn (re &RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) { |
| 967 | mut status := Group_parse_state.start |
| 968 | mut i := in_i |
| 969 | mut name := '' |
| 970 | |
| 971 | for i < in_txt.len && status != .finish { |
| 972 | // get our char |
| 973 | char_tmp, char_len := re.get_char(in_txt, i) |
| 974 | ch := u8(char_tmp) |
| 975 | |
| 976 | // start |
| 977 | if status == .start && ch == `(` { |
| 978 | status = .q_mark |
| 979 | i += char_len |
| 980 | continue |
| 981 | } |
| 982 | |
| 983 | // check for question marks |
| 984 | if status == .q_mark && ch == `?` { |
| 985 | status = .q_mark1 |
| 986 | i += char_len |
| 987 | continue |
| 988 | } |
| 989 | |
| 990 | // negate group |
| 991 | if status == .q_mark1 && ch == `!` { |
| 992 | i += char_len |
| 993 | return 0, false, true, name, i |
| 994 | } |
| 995 | |
| 996 | // non capturing group |
| 997 | if status == .q_mark1 && ch == `:` { |
| 998 | i += char_len |
| 999 | return 0, false, false, name, i |
| 1000 | } |
| 1001 | |
| 1002 | // enter in P section |
| 1003 | if status == .q_mark1 && ch == `P` { |
| 1004 | status = .p_status |
| 1005 | i += char_len |
| 1006 | continue |
| 1007 | } |
| 1008 | |
| 1009 | // not a valid q mark found |
| 1010 | if status == .q_mark1 { |
| 1011 | // println("NO VALID Q MARK") |
| 1012 | return -2, true, false, name, i |
| 1013 | } |
| 1014 | |
| 1015 | if status == .p_status && ch == `<` { |
| 1016 | status = .p_start |
| 1017 | i += char_len |
| 1018 | continue |
| 1019 | } |
| 1020 | |
| 1021 | if status == .p_start && ch != `>` { |
| 1022 | status = .p_in_name |
| 1023 | name += '${ch:1c}' // TODO: manage utf8 chars |
| 1024 | i += char_len |
| 1025 | continue |
| 1026 | } |
| 1027 | |
| 1028 | // colect name |
| 1029 | if status == .p_in_name && ch != `>` && is_alnum(ch) { |
| 1030 | name += '${ch:1c}' // TODO: manage utf8 chars |
| 1031 | i += char_len |
| 1032 | continue |
| 1033 | } |
| 1034 | |
| 1035 | // end name |
| 1036 | if status == .p_in_name && ch == `>` { |
| 1037 | i += char_len |
| 1038 | return 0, true, false, name, i |
| 1039 | } |
| 1040 | |
| 1041 | // error on name group |
| 1042 | if status == .p_in_name { |
| 1043 | return -2, true, false, name, i |
| 1044 | } |
| 1045 | |
| 1046 | // normal group, nothig to do, exit |
| 1047 | return 0, true, false, name, i |
| 1048 | } |
| 1049 | // UNREACHABLE |
| 1050 | // println("ERROR!! NOT MEANT TO BE HERE!!1") |
| 1051 | return -2, true, false, name, i |
| 1052 | } |
| 1053 | |
| 1054 | const quntifier_chars = [rune(`+`), `*`, `?`, `{`] |
| 1055 | |
| 1056 | // |
| 1057 | // main compiler |
| 1058 | // |
| 1059 | // compile return (return code, index) where index is the index of the error in the query string if return code is an error code |
| 1060 | fn (mut re RE) impl_compile(in_txt string) (int, int) { |
| 1061 | mut i := 0 // input string index |
| 1062 | mut pc := 0 // program counter |
| 1063 | |
| 1064 | // group management variables |
| 1065 | mut group_count := -1 |
| 1066 | mut group_stack := []int{len: re.group_max_nested, init: 0} |
| 1067 | mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1} |
| 1068 | mut group_stack_index := -1 |
| 1069 | |
| 1070 | re.query = in_txt // save the query string |
| 1071 | |
| 1072 | i = 0 |
| 1073 | for i < in_txt.len { |
| 1074 | mut char_tmp := u32(0) |
| 1075 | mut char_len := 0 |
| 1076 | // println("i: ${i:3d} ch: ${in_txt.str[i]:c}") |
| 1077 | |
| 1078 | char_tmp, char_len = re.get_char(in_txt, i) |
| 1079 | |
| 1080 | // |
| 1081 | // check special cases: $ ^ |
| 1082 | // |
| 1083 | if char_len == 1 && i == 0 && u8(char_tmp) == `^` { |
| 1084 | re.flag |= f_ms |
| 1085 | i = i + char_len |
| 1086 | continue |
| 1087 | } |
| 1088 | if char_len == 1 && i == (in_txt.len - 1) && u8(char_tmp) == `$` { |
| 1089 | re.flag |= f_me |
| 1090 | i = i + char_len |
| 1091 | continue |
| 1092 | } |
| 1093 | |
| 1094 | // ist_group_start |
| 1095 | if char_len == 1 && pc >= 0 && u8(char_tmp) == `(` { |
| 1096 | // check max groups allowed |
| 1097 | if group_count > re.group_max { |
| 1098 | return err_groups_overflow, i + 1 |
| 1099 | } |
| 1100 | group_stack_index++ |
| 1101 | |
| 1102 | // check max nested groups allowed |
| 1103 | if group_stack_index > re.group_max_nested { |
| 1104 | return err_groups_max_nested, i + 1 |
| 1105 | } |
| 1106 | |
| 1107 | tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt, i) |
| 1108 | |
| 1109 | // manage question mark format error |
| 1110 | if tmp_res < -1 { |
| 1111 | return err_group_qm_notation, next_i |
| 1112 | } |
| 1113 | |
| 1114 | // println("Parse group: [${tmp_res}, ${cgroup_flag}, (${i},${next_i}), '${in_txt[i..next_i]}' ]") |
| 1115 | i = next_i |
| 1116 | |
| 1117 | if cgroup_flag == true { |
| 1118 | group_count++ |
| 1119 | } |
| 1120 | |
| 1121 | // calculate the group id |
| 1122 | // if it is a named group, recycle the group id |
| 1123 | // NOTE: **** the group index is +1 because map return 0 when not found!! **** |
| 1124 | mut group_id := group_count |
| 1125 | if cgroup_name.len > 0 { |
| 1126 | // println("GROUP NAME: ${cgroup_name}") |
| 1127 | if cgroup_name in re.group_map { |
| 1128 | group_id = re.group_map[cgroup_name] - 1 |
| 1129 | group_count-- |
| 1130 | } else { |
| 1131 | re.group_map[cgroup_name] = group_id + 1 |
| 1132 | } |
| 1133 | } |
| 1134 | |
| 1135 | group_stack_txt_index[group_stack_index] = i |
| 1136 | group_stack[group_stack_index] = pc |
| 1137 | |
| 1138 | re.prog[pc].ist = u32(0) | ist_group_start |
| 1139 | re.prog[pc].rep_min = 1 |
| 1140 | re.prog[pc].rep_max = 1 |
| 1141 | |
| 1142 | // manage negation groups |
| 1143 | if negate_flag == true { |
| 1144 | re.prog[pc].group_neg = true |
| 1145 | re.prog[pc].rep_min = 0 // may not be caught, but it is ok |
| 1146 | } |
| 1147 | |
| 1148 | // set the group id |
| 1149 | if cgroup_flag == false { |
| 1150 | // println("NO CAPTURE GROUP") |
| 1151 | re.prog[pc].group_id = -1 |
| 1152 | } else { |
| 1153 | re.prog[pc].group_id = group_id |
| 1154 | } |
| 1155 | |
| 1156 | pc = pc + 1 |
| 1157 | continue |
| 1158 | } |
| 1159 | |
| 1160 | // ist_group_end |
| 1161 | if char_len == 1 && pc > 0 && u8(char_tmp) == `)` { |
| 1162 | if group_stack_index < 0 { |
| 1163 | return err_group_not_balanced, i + 1 |
| 1164 | } |
| 1165 | |
| 1166 | goto_pc := group_stack[group_stack_index] |
| 1167 | group_stack_index-- |
| 1168 | |
| 1169 | re.prog[pc].ist = u32(0) | ist_group_end |
| 1170 | re.prog[pc].rep_min = 1 |
| 1171 | re.prog[pc].rep_max = 1 |
| 1172 | |
| 1173 | re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need |
| 1174 | re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data |
| 1175 | |
| 1176 | re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc |
| 1177 | // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data |
| 1178 | |
| 1179 | // duplicate the negation group info and settings |
| 1180 | if re.prog[goto_pc].group_neg == true { |
| 1181 | re.prog[pc].group_neg = re.prog[goto_pc].group_neg |
| 1182 | re.prog[pc].rep_min = re.prog[goto_pc].rep_min |
| 1183 | } |
| 1184 | |
| 1185 | pc = pc + 1 |
| 1186 | i = i + char_len |
| 1187 | continue |
| 1188 | } |
| 1189 | |
| 1190 | // ist_dot_char match any char except the following token |
| 1191 | if char_len == 1 && pc >= 0 && u8(char_tmp) == `.` { |
| 1192 | // consecutive ist_dot_char is a syntax error |
| 1193 | if pc > 0 && re.prog[pc - 1].ist == ist_dot_char { |
| 1194 | return err_consecutive_dots, i |
| 1195 | } |
| 1196 | |
| 1197 | re.prog[pc].ist = u32(0) | ist_dot_char |
| 1198 | re.prog[pc].rep_min = 1 |
| 1199 | re.prog[pc].rep_max = 1 |
| 1200 | pc = pc + 1 |
| 1201 | i = i + char_len |
| 1202 | continue |
| 1203 | } |
| 1204 | |
| 1205 | // OR branch |
| 1206 | if char_len == 1 && pc > 0 && u8(char_tmp) == `|` { |
| 1207 | if pc > 0 && re.prog[pc - 1].ist == ist_or_branch { |
| 1208 | return err_syntax_error, i |
| 1209 | } |
| 1210 | re.prog[pc].ist = u32(0) | ist_or_branch |
| 1211 | re.prog[pc].source_index = i |
| 1212 | pc = pc + 1 |
| 1213 | i = i + char_len |
| 1214 | continue |
| 1215 | } |
| 1216 | |
| 1217 | // Quantifiers |
| 1218 | if char_len == 1 && pc > 0 { |
| 1219 | mut char_next := rune(0) |
| 1220 | mut char_next_len := 0 |
| 1221 | if (char_len + i) < in_txt.len { |
| 1222 | char_next, char_next_len = re.get_char(in_txt, i + char_len) |
| 1223 | } |
| 1224 | mut quant_flag := true |
| 1225 | |
| 1226 | // negation groups can not have quantifiers |
| 1227 | if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] { |
| 1228 | return err_neg_group_quantifier, i |
| 1229 | } |
| 1230 | |
| 1231 | match u8(char_tmp) { |
| 1232 | `?` { |
| 1233 | // println("q: ${char_tmp:c}") |
| 1234 | // check illegal quantifier sequences |
| 1235 | if char_next_len == 1 && char_next in quntifier_chars { |
| 1236 | return err_syntax_error, i |
| 1237 | } |
| 1238 | re.prog[pc - 1].rep_min = 0 |
| 1239 | re.prog[pc - 1].rep_max = 1 |
| 1240 | } |
| 1241 | `+` { |
| 1242 | // println("q: ${char_tmp:c}") |
| 1243 | // check illegal quantifier sequences |
| 1244 | if char_next_len == 1 && char_next in quntifier_chars { |
| 1245 | return err_syntax_error, i |
| 1246 | } |
| 1247 | re.prog[pc - 1].rep_min = 1 |
| 1248 | re.prog[pc - 1].rep_max = max_quantifier |
| 1249 | } |
| 1250 | `*` { |
| 1251 | // println("q: ${char_tmp:c}") |
| 1252 | // check illegal quantifier sequences |
| 1253 | if char_next_len == 1 && char_next in quntifier_chars { |
| 1254 | return err_syntax_error, i |
| 1255 | } |
| 1256 | re.prog[pc - 1].rep_min = 0 |
| 1257 | re.prog[pc - 1].rep_max = max_quantifier |
| 1258 | } |
| 1259 | `{` { |
| 1260 | min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1) |
| 1261 | // it is a quantifier |
| 1262 | if min >= 0 { |
| 1263 | // println("{${min},${max}}\n str:[${in_txt[i..i+tmp]}] greedy:${greedy}") |
| 1264 | i = i + tmp |
| 1265 | re.prog[pc - 1].rep_min = min |
| 1266 | re.prog[pc - 1].rep_max = max |
| 1267 | re.prog[pc - 1].greedy = greedy |
| 1268 | // check illegal quantifier sequences |
| 1269 | if i <= in_txt.len { |
| 1270 | char_next, char_next_len = re.get_char(in_txt, i) |
| 1271 | if char_next_len == 1 && char_next in quntifier_chars { |
| 1272 | return err_syntax_error, i |
| 1273 | } |
| 1274 | } |
| 1275 | continue |
| 1276 | } else { |
| 1277 | return min, i |
| 1278 | } |
| 1279 | |
| 1280 | // TODO: decide if the open bracket can be conform without the close bracket |
| 1281 | /* |
| 1282 | // no conform, parse as normal char |
| 1283 | else { |
| 1284 | quant_flag = false |
| 1285 | } |
| 1286 | */ |
| 1287 | } |
| 1288 | else { |
| 1289 | quant_flag = false |
| 1290 | } |
| 1291 | } |
| 1292 | |
| 1293 | if quant_flag { |
| 1294 | i = i + char_len |
| 1295 | continue |
| 1296 | } |
| 1297 | } |
| 1298 | |
| 1299 | // IST_CHAR_CLASS_* |
| 1300 | if char_len == 1 && pc >= 0 { |
| 1301 | if u8(char_tmp) == `[` { |
| 1302 | cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1) |
| 1303 | if cc_index >= 0 { |
| 1304 | // println("index: ${cc_index} str:${in_txt[i..i+tmp]}") |
| 1305 | i = i + tmp |
| 1306 | re.prog[pc].ist = u32(0) | cc_type |
| 1307 | re.prog[pc].cc_index = cc_index |
| 1308 | re.prog[pc].rep_min = 1 |
| 1309 | re.prog[pc].rep_max = 1 |
| 1310 | pc = pc + 1 |
| 1311 | continue |
| 1312 | } |
| 1313 | // cc_class vector memory full |
| 1314 | else if cc_index < 0 { |
| 1315 | return cc_index, i |
| 1316 | } |
| 1317 | } |
| 1318 | } |
| 1319 | |
| 1320 | // ist_bsls_char |
| 1321 | if char_len == 1 && pc >= 0 { |
| 1322 | if u8(char_tmp) == `\\` { |
| 1323 | // if the index is negative: |
| 1324 | // -1 ERROR |
| 1325 | // -2 hex byte code BLSL |
| 1326 | bsls_index, tmp, hex_res := re.parse_bsls(in_txt, i) |
| 1327 | if bsls_index >= 0 { |
| 1328 | i = i + tmp |
| 1329 | re.prog[pc].ist = u32(0) | ist_bsls_char |
| 1330 | re.prog[pc].rep_min = 1 |
| 1331 | re.prog[pc].rep_max = 1 |
| 1332 | re.prog[pc].validator = bsls_validator_array[bsls_index].validator |
| 1333 | re.prog[pc].ch = bsls_validator_array[bsls_index].ch |
| 1334 | pc = pc + 1 |
| 1335 | continue |
| 1336 | } |
| 1337 | // hex char |
| 1338 | // this code can mange up to \x for 32 bit |
| 1339 | // at the present time only 8/16 bit are used |
| 1340 | else if bsls_index == -2 { |
| 1341 | mut value := hex_res |
| 1342 | mut value_list := []u32{cap: 4} |
| 1343 | mut count := 0 |
| 1344 | for value > 0 { |
| 1345 | value_list << value & 0xFF |
| 1346 | value = value >> 8 |
| 1347 | count++ |
| 1348 | } |
| 1349 | |
| 1350 | count-- |
| 1351 | for count >= 0 { |
| 1352 | re.prog[pc].ist = ist_simple_char |
| 1353 | re.prog[pc].ch = value_list[count] |
| 1354 | re.prog[pc].ch_len = u8(char_len) |
| 1355 | re.prog[pc].rep_min = 1 |
| 1356 | re.prog[pc].rep_max = 1 |
| 1357 | re.prog[pc].flag = 1 // state a byte char |
| 1358 | // println("char: ${char_tmp:c}") |
| 1359 | pc = pc + 1 |
| 1360 | count-- |
| 1361 | } |
| 1362 | i = i + tmp |
| 1363 | continue |
| 1364 | } |
| 1365 | // this is an escape char, skip the bsls and continue as a normal char |
| 1366 | else if bsls_index == no_match_found { |
| 1367 | i += char_len |
| 1368 | char_tmp, char_len = re.get_char(in_txt, i) |
| 1369 | // continue as simple char |
| 1370 | } |
| 1371 | // if not an escape or a bsls char then it is an error (at least for now!) |
| 1372 | else { |
| 1373 | return bsls_index, i + tmp |
| 1374 | } |
| 1375 | } |
| 1376 | } |
| 1377 | |
| 1378 | // ist_simple_char |
| 1379 | re.prog[pc].ist = ist_simple_char |
| 1380 | re.prog[pc].ch = char_tmp |
| 1381 | re.prog[pc].ch_len = u8(char_len) |
| 1382 | re.prog[pc].rep_min = 1 |
| 1383 | re.prog[pc].rep_max = 1 |
| 1384 | // println("char: ${char_tmp:c}") |
| 1385 | pc = pc + 1 |
| 1386 | |
| 1387 | i += char_len |
| 1388 | } |
| 1389 | |
| 1390 | // add end of the program |
| 1391 | re.prog[pc].ist = ist_prog_end |
| 1392 | re.prog_len = pc |
| 1393 | |
| 1394 | // check for unbalanced groups |
| 1395 | if group_stack_index != -1 { |
| 1396 | return err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1 |
| 1397 | } |
| 1398 | |
| 1399 | // check for OR at the end of the program |
| 1400 | if pc > 0 && re.prog[pc - 1].ist == ist_or_branch { |
| 1401 | return err_syntax_error, in_txt.len - 1 |
| 1402 | } |
| 1403 | |
| 1404 | // store the number of groups in the query |
| 1405 | re.group_count = group_count + 1 |
| 1406 | |
| 1407 | //****************************************** |
| 1408 | // Post processing |
| 1409 | //****************************************** |
| 1410 | |
| 1411 | // |
| 1412 | // manage ist_dot_char |
| 1413 | // |
| 1414 | |
| 1415 | // find the checks for dot chars, if any... |
| 1416 | mut pc1 := 0 |
| 1417 | mut dot_char_count := 0 |
| 1418 | mut last_dot_char_pc := -1 |
| 1419 | for pc1 < pc { |
| 1420 | if re.prog[pc1].ist == ist_dot_char { |
| 1421 | // println("Dot_char pc: ${pc1}") |
| 1422 | last_dot_char_pc = pc1 |
| 1423 | dot_char_count++ |
| 1424 | mut pc2 := pc1 + 1 |
| 1425 | for pc2 < pc { |
| 1426 | // consecutive dot chars is an error |
| 1427 | if re.prog[pc2].ist == ist_dot_char { |
| 1428 | return err_syntax_error, 0 |
| 1429 | } |
| 1430 | if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] { |
| 1431 | // println("Next dot char check is PC: ${pc2}") |
| 1432 | re.prog[pc1].dot_check_pc = pc2 |
| 1433 | break |
| 1434 | } |
| 1435 | pc2++ |
| 1436 | } |
| 1437 | } |
| 1438 | pc1++ |
| 1439 | } |
| 1440 | |
| 1441 | // println("last_dot_char_pc: ${last_dot_char_pc}") |
| 1442 | if last_dot_char_pc >= 0 { |
| 1443 | pc1 = last_dot_char_pc + 1 |
| 1444 | mut is_last_dot := true |
| 1445 | for pc1 < pc { |
| 1446 | if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] { |
| 1447 | is_last_dot = false |
| 1448 | break |
| 1449 | } |
| 1450 | pc1++ |
| 1451 | } |
| 1452 | if is_last_dot { |
| 1453 | re.prog[last_dot_char_pc].last_dot_flag = true |
| 1454 | } |
| 1455 | } |
| 1456 | |
| 1457 | // |
| 1458 | // manage bsls_char |
| 1459 | // |
| 1460 | |
| 1461 | // find the checks for bsls, if any... |
| 1462 | pc1 = 0 |
| 1463 | mut bsls_char_count := 0 |
| 1464 | mut last_bsls_char_pc := -1 |
| 1465 | for pc1 < pc { |
| 1466 | if re.prog[pc1].ist == ist_bsls_char { |
| 1467 | // println("bsls_char pc: ${pc1}") |
| 1468 | last_bsls_char_pc = pc1 |
| 1469 | bsls_char_count++ |
| 1470 | mut pc2 := pc1 + 1 |
| 1471 | for pc2 < pc { |
| 1472 | if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] { |
| 1473 | // println("Next bsls check is PC: ${pc2}") |
| 1474 | re.prog[pc1].bsls_check_pc = pc2 |
| 1475 | break |
| 1476 | } |
| 1477 | pc2++ |
| 1478 | } |
| 1479 | } |
| 1480 | pc1++ |
| 1481 | } |
| 1482 | |
| 1483 | // println('last_bsls_char_pc: ${last_bsls_char_pc}') |
| 1484 | if last_bsls_char_pc >= 0 { |
| 1485 | pc1 = last_bsls_char_pc + 1 |
| 1486 | mut is_last_bsls := true |
| 1487 | for pc1 < pc { |
| 1488 | if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] { |
| 1489 | is_last_bsls = false |
| 1490 | break |
| 1491 | } |
| 1492 | pc1++ |
| 1493 | } |
| 1494 | if is_last_bsls { |
| 1495 | re.prog[last_bsls_char_pc].last_dot_flag = true |
| 1496 | } |
| 1497 | } |
| 1498 | |
| 1499 | // |
| 1500 | // manage CC |
| 1501 | // |
| 1502 | pc1 = 0 |
| 1503 | mut cc_char_count := 0 |
| 1504 | mut last_cc_char_pc := -1 |
| 1505 | for pc1 < pc { |
| 1506 | if re.prog[pc1].ist in [u32(ist_char_class_pos), ist_char_class_neg] { |
| 1507 | last_cc_char_pc = pc1 |
| 1508 | cc_char_count++ |
| 1509 | mut pc2 := pc1 + 1 |
| 1510 | for pc2 < pc { |
| 1511 | if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] { |
| 1512 | // println("Next CC check is PC: ${pc2}") |
| 1513 | re.prog[pc1].cc_check_pc = pc2 |
| 1514 | break |
| 1515 | } |
| 1516 | pc2++ |
| 1517 | } |
| 1518 | } |
| 1519 | pc1++ |
| 1520 | } |
| 1521 | |
| 1522 | // println('last_cc_char_pc: ${last_cc_char_pc}') |
| 1523 | if last_cc_char_pc >= 0 { |
| 1524 | pc1 = last_cc_char_pc + 1 |
| 1525 | mut is_last_cc := true |
| 1526 | for pc1 < pc { |
| 1527 | if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] { |
| 1528 | is_last_cc = false |
| 1529 | break |
| 1530 | } |
| 1531 | pc1++ |
| 1532 | } |
| 1533 | if is_last_cc { |
| 1534 | re.prog[last_cc_char_pc].last_dot_flag = true |
| 1535 | } |
| 1536 | } |
| 1537 | |
| 1538 | //****************************************** |
| 1539 | |
| 1540 | // OR branch |
| 1541 | // a|b|cd |
| 1542 | // d exit point |
| 1543 | // a,b,c branches |
| 1544 | // set the jump in the right places |
| 1545 | pc1 = 0 |
| 1546 | for pc1 < pc - 2 { |
| 1547 | // println("Here ${pc1} ${pc-2}") |
| 1548 | // println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}") |
| 1549 | if re.prog[pc1 + 1].ist == ist_or_branch { |
| 1550 | // two consecutive OR are a syntax error |
| 1551 | if re.prog[pc1 + 2].ist == ist_or_branch { |
| 1552 | return err_syntax_error, i |
| 1553 | } |
| 1554 | |
| 1555 | // check for []|[] errors |
| 1556 | if re.prog[pc1].ist == ist_char_class_pos && re.prog[pc1 + 2].ist == ist_char_class_pos { |
| 1557 | return err_invalid_or_with_cc, re.prog[pc1 + 1].source_index |
| 1558 | } |
| 1559 | } |
| 1560 | |
| 1561 | // manange a|b chains like a|(b)|c|d... |
| 1562 | // standard solution |
| 1563 | if re.prog[pc1].ist != ist_or_branch && re.prog[pc1 + 1].ist == ist_or_branch |
| 1564 | && re.prog[pc1 + 2].ist != ist_or_branch { |
| 1565 | re.prog[pc1].next_is_or = true // set that the next token is an OR |
| 1566 | re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump |
| 1567 | |
| 1568 | // match jump, if an OR chain the next token will be an OR token |
| 1569 | mut pc2 := pc1 + 2 |
| 1570 | for pc2 < pc - 1 { |
| 1571 | ist := re.prog[pc2].ist |
| 1572 | if ist == ist_group_start { |
| 1573 | re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1 |
| 1574 | break |
| 1575 | } |
| 1576 | if ist != ist_or_branch { |
| 1577 | re.prog[pc1 + 1].rep_max = pc2 + 1 |
| 1578 | break |
| 1579 | } |
| 1580 | |
| 1581 | pc2++ |
| 1582 | } |
| 1583 | // special case query of few chars, the true can't go on the first instruction |
| 1584 | if re.prog[pc1 + 1].rep_max == pc1 { |
| 1585 | re.prog[pc1 + 1].rep_max = 3 |
| 1586 | } |
| 1587 | // println("Compile OR postproc. [${pc1},OR ${pc1+1},${pc2}]") |
| 1588 | pc1 = pc2 |
| 1589 | continue |
| 1590 | } |
| 1591 | |
| 1592 | pc1++ |
| 1593 | } |
| 1594 | |
| 1595 | //****************************************** |
| 1596 | // DEBUG PRINT REGEX GENERATED CODE |
| 1597 | //****************************************** |
| 1598 | if re.debug > 0 { |
| 1599 | gc := re.get_code() |
| 1600 | re.log_func(gc) |
| 1601 | } |
| 1602 | //****************************************** |
| 1603 | |
| 1604 | return compile_ok, 0 |
| 1605 | } |
| 1606 | |
| 1607 | // get_code return the compiled code as regex string, note: may be different from the source! |
| 1608 | pub fn (re &RE) get_code() string { |
| 1609 | mut pc1 := 0 |
| 1610 | mut res := strings.new_builder(re.cc.len * 2 * re.prog.len) |
| 1611 | res.write_string('========================================\nv RegEx compiler v ${v_regex_version} output:\n') |
| 1612 | |
| 1613 | mut stop_flag := false |
| 1614 | |
| 1615 | for pc1 <= re.prog.len { |
| 1616 | tk := re.prog[pc1] |
| 1617 | res.write_string('PC:${pc1:3d}') |
| 1618 | |
| 1619 | res.write_string(' ist: ') |
| 1620 | res.write_string('${tk.ist:8x}'.replace(' ', '0')) |
| 1621 | res.write_string(' ') |
| 1622 | ist := tk.ist |
| 1623 | if ist == ist_bsls_char { |
| 1624 | res.write_string('[\\${tk.ch:1c}] BSLS') |
| 1625 | if tk.last_dot_flag == true { |
| 1626 | res.write_string(' last!') |
| 1627 | } |
| 1628 | } else if ist == ist_prog_end { |
| 1629 | res.write_string('PROG_END') |
| 1630 | stop_flag = true |
| 1631 | } else if ist == ist_or_branch { |
| 1632 | res.write_string('OR ') |
| 1633 | } else if ist == ist_char_class_pos { |
| 1634 | res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS') |
| 1635 | if tk.last_dot_flag == true { |
| 1636 | res.write_string(' last!') |
| 1637 | } |
| 1638 | } else if ist == ist_char_class_neg { |
| 1639 | res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG') |
| 1640 | if tk.last_dot_flag == true { |
| 1641 | res.write_string(' last!') |
| 1642 | } |
| 1643 | } else if ist == ist_dot_char { |
| 1644 | res.write_string('. DOT_CHAR nx chk: ${tk.dot_check_pc}') |
| 1645 | if tk.last_dot_flag == true { |
| 1646 | res.write_string(' last!') |
| 1647 | } |
| 1648 | } else if ist == ist_group_start { |
| 1649 | res.write_string('( GROUP_START #:${tk.group_id}') |
| 1650 | if tk.group_id == -1 { |
| 1651 | res.write_string(' ?:') |
| 1652 | } else { |
| 1653 | for x in re.group_map.keys() { |
| 1654 | if re.group_map[x] == (tk.group_id + 1) { |
| 1655 | res.write_string(' ?P<${x}>') |
| 1656 | break |
| 1657 | } |
| 1658 | } |
| 1659 | } |
| 1660 | } else if ist == ist_group_end { |
| 1661 | res.write_string(') GROUP_END #:${tk.group_id}') |
| 1662 | } else if ist == ist_simple_char { |
| 1663 | if tk.flag == 0 { |
| 1664 | res.write_string('[${tk.ch:1c}] query_ch') |
| 1665 | } else { |
| 1666 | res.write_string('[0x${tk.ch:02X}]HEXquery_ch') |
| 1667 | } |
| 1668 | } |
| 1669 | |
| 1670 | if tk.rep_max == max_quantifier { |
| 1671 | res.write_string(' {${tk.rep_min:3d},MAX}') |
| 1672 | } else { |
| 1673 | if ist == ist_or_branch { |
| 1674 | res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}') |
| 1675 | } else { |
| 1676 | res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}') |
| 1677 | } |
| 1678 | if tk.greedy == true { |
| 1679 | res.write_string('?') |
| 1680 | } |
| 1681 | } |
| 1682 | |
| 1683 | res.write_string('\n') |
| 1684 | if stop_flag { |
| 1685 | break |
| 1686 | } |
| 1687 | pc1++ |
| 1688 | } |
| 1689 | |
| 1690 | res.write_string('========================================\n') |
| 1691 | return res.str() |
| 1692 | } |
| 1693 | |
| 1694 | // get_query return a string with a reconstruction of the query starting from the regex program code |
| 1695 | pub fn (re &RE) get_query() string { |
| 1696 | mut res := strings.new_builder(re.query.len * 2) |
| 1697 | |
| 1698 | if (re.flag & f_ms) != 0 { |
| 1699 | res.write_string('^') |
| 1700 | } |
| 1701 | |
| 1702 | mut i := 0 |
| 1703 | for i < re.prog.len && re.prog[i].ist != ist_prog_end && re.prog[i].ist != 0 { |
| 1704 | tk := unsafe { &re.prog[i] } |
| 1705 | ch := tk.ist |
| 1706 | |
| 1707 | // GROUP start |
| 1708 | if ch == ist_group_start { |
| 1709 | if re.debug > 0 { |
| 1710 | res.write_string('#${tk.group_id}') |
| 1711 | } |
| 1712 | res.write_string('(') |
| 1713 | |
| 1714 | if tk.group_neg == true { |
| 1715 | res.write_string('?!') // negation group |
| 1716 | } else if tk.group_id == -1 { |
| 1717 | res.write_string('?:') // non capturing group |
| 1718 | } |
| 1719 | |
| 1720 | for x in re.group_map.keys() { |
| 1721 | if re.group_map[x] == (tk.group_id + 1) { |
| 1722 | res.write_string('?P<${x}>') |
| 1723 | break |
| 1724 | } |
| 1725 | } |
| 1726 | |
| 1727 | i++ |
| 1728 | continue |
| 1729 | } |
| 1730 | |
| 1731 | // GROUP end |
| 1732 | if ch == ist_group_end { |
| 1733 | res.write_string(')') |
| 1734 | } |
| 1735 | |
| 1736 | // OR branch |
| 1737 | if ch == ist_or_branch { |
| 1738 | res.write_string('|') |
| 1739 | if re.debug > 0 { |
| 1740 | res.write_string('{${tk.rep_min},${tk.rep_max}}') |
| 1741 | } |
| 1742 | i++ |
| 1743 | continue |
| 1744 | } |
| 1745 | |
| 1746 | // char class |
| 1747 | if ch == ist_char_class_neg || ch == ist_char_class_pos { |
| 1748 | res.write_string('[') |
| 1749 | if ch == ist_char_class_neg { |
| 1750 | res.write_string('^') |
| 1751 | } |
| 1752 | res.write_string('${re.get_char_class(i)}') |
| 1753 | res.write_string(']') |
| 1754 | } |
| 1755 | |
| 1756 | // bsls char |
| 1757 | if ch == ist_bsls_char { |
| 1758 | res.write_string('\\${tk.ch:1c}') |
| 1759 | } |
| 1760 | |
| 1761 | // ist_dot_char |
| 1762 | if ch == ist_dot_char { |
| 1763 | res.write_string('.') |
| 1764 | } |
| 1765 | |
| 1766 | // char alone |
| 1767 | if ch == ist_simple_char { |
| 1768 | if tk.flag == 0 { |
| 1769 | if u8(ch) in bsls_escape_list { |
| 1770 | res.write_string('\\') |
| 1771 | } |
| 1772 | res.write_string('${tk.ch:c}') |
| 1773 | } else { |
| 1774 | res.write_string('\\x${tk.ch:02x}') |
| 1775 | } |
| 1776 | } |
| 1777 | |
| 1778 | // quantifier |
| 1779 | if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false { |
| 1780 | if tk.rep_min == 0 && tk.rep_max == 1 { |
| 1781 | res.write_string('?') |
| 1782 | } else if tk.rep_min == 1 && tk.rep_max == max_quantifier { |
| 1783 | res.write_string('+') |
| 1784 | } else if tk.rep_min == 0 && tk.rep_max == max_quantifier { |
| 1785 | res.write_string('*') |
| 1786 | } else { |
| 1787 | if tk.rep_max == max_quantifier { |
| 1788 | res.write_string('{${tk.rep_min},MAX}') |
| 1789 | } else { |
| 1790 | res.write_string('{${tk.rep_min},${tk.rep_max}}') |
| 1791 | } |
| 1792 | if tk.greedy == true { |
| 1793 | res.write_string('?') |
| 1794 | } |
| 1795 | } |
| 1796 | } |
| 1797 | i++ |
| 1798 | } |
| 1799 | if (re.flag & f_me) != 0 { |
| 1800 | res.write_string('$') |
| 1801 | } |
| 1802 | |
| 1803 | return res.str() |
| 1804 | } |
| 1805 | |
| 1806 | /****************************************************************************** |
| 1807 | * |
| 1808 | * Groups saving utilities |
| 1809 | * |
| 1810 | ******************************************************************************/ |
| 1811 | @[direct_array_access] |
| 1812 | fn (mut re RE) group_continuous_save(g_index int) { |
| 1813 | if re.group_csave_flag == true { |
| 1814 | // continuous save, save until we have space |
| 1815 | |
| 1816 | // init the first element as counter |
| 1817 | if re.group_csave.len == 0 { |
| 1818 | re.group_csave << 0 |
| 1819 | } |
| 1820 | |
| 1821 | gi := g_index >> 1 |
| 1822 | start := re.groups[g_index] |
| 1823 | end := re.groups[g_index + 1] |
| 1824 | |
| 1825 | // check if we are simply increasing the size ot the found group |
| 1826 | if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3] |
| 1827 | && start == re.group_csave[re.group_csave.len - 2] { |
| 1828 | re.group_csave[re.group_csave.len - 1] = end |
| 1829 | return |
| 1830 | } |
| 1831 | |
| 1832 | // otherwise append a new group to the list |
| 1833 | |
| 1834 | // increment counter |
| 1835 | re.group_csave[0]++ |
| 1836 | // save the record |
| 1837 | re.group_csave << (g_index >> 1) // group id |
| 1838 | re.group_csave << re.groups[g_index] // start |
| 1839 | re.group_csave << re.groups[g_index + 1] // end |
| 1840 | } |
| 1841 | } |
| 1842 | |
| 1843 | /****************************************************************************** |
| 1844 | * |
| 1845 | * Matching |
| 1846 | * |
| 1847 | ******************************************************************************/ |
| 1848 | enum Match_state { |
| 1849 | start = 0 |
| 1850 | stop |
| 1851 | end |
| 1852 | new_line |
| 1853 | ist_load // load and execute instruction |
| 1854 | ist_next // go to next instruction |
| 1855 | ist_next_ks // go to next instruction without clenaning the state |
| 1856 | ist_quant_p // match positive ,quantifier check |
| 1857 | ist_quant_n // match negative, quantifier check |
| 1858 | ist_quant_pg // match positive ,group quantifier check |
| 1859 | ist_quant_ng // match negative ,group quantifier check |
| 1860 | } |
| 1861 | |
| 1862 | fn state_str(s Match_state) string { |
| 1863 | match s { |
| 1864 | .start { return 'start' } |
| 1865 | .stop { return 'stop' } |
| 1866 | .end { return 'end' } |
| 1867 | .new_line { return 'new line' } |
| 1868 | .ist_load { return 'ist_load' } |
| 1869 | .ist_next { return 'ist_next' } |
| 1870 | .ist_next_ks { return 'ist_next_ks' } |
| 1871 | .ist_quant_p { return 'ist_quant_p' } |
| 1872 | .ist_quant_n { return 'ist_quant_n' } |
| 1873 | .ist_quant_pg { return 'ist_quant_pg' } |
| 1874 | .ist_quant_ng { return 'ist_quant_ng' } |
| 1875 | } |
| 1876 | } |
| 1877 | |
| 1878 | struct StateObj { |
| 1879 | pub mut: |
| 1880 | group_index int = -1 // group id used to know how many groups are open |
| 1881 | match_flag bool // indicate if we are in a match condition |
| 1882 | match_index int = -1 // index of the last match |
| 1883 | first_match int = -1 // index of the first match |
| 1884 | pc int = -1 // program counter |
| 1885 | i int = -1 // source string index |
| 1886 | char_len int // last char legth |
| 1887 | last_dot_pc int = -1 // last dot chat pc |
| 1888 | } |
| 1889 | |
| 1890 | @[direct_array_access] |
| 1891 | pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) { |
| 1892 | // result status |
| 1893 | mut result := no_match_found // function return |
| 1894 | |
| 1895 | mut ch := rune(0) // examinated char |
| 1896 | mut char_len := 0 // utf8 examinated char len |
| 1897 | mut m_state := Match_state.start // start point for the matcher FSM |
| 1898 | mut src_end := false |
| 1899 | mut last_fnd_pc := -1 |
| 1900 | |
| 1901 | mut state := StateObj{} // actual state |
| 1902 | mut ist := u32(0) // actual instruction |
| 1903 | mut l_ist := u32(0) // last matched instruction |
| 1904 | |
| 1905 | mut step_count := 0 // stats for debug |
| 1906 | mut dbg_line := 0 // count debug line printed |
| 1907 | |
| 1908 | re.reset() |
| 1909 | |
| 1910 | if re.debug > 0 { |
| 1911 | // print header |
| 1912 | mut h_buf := strings.new_builder(32) |
| 1913 | h_buf.write_string('flags: ') |
| 1914 | h_buf.write_string('${re.flag:8x}'.replace(' ', '0')) |
| 1915 | h_buf.write_string('\n') |
| 1916 | sss := h_buf.str() |
| 1917 | re.log_func(sss) |
| 1918 | } |
| 1919 | |
| 1920 | for m_state != .end { |
| 1921 | if state.pc >= 0 && state.pc < re.prog.len { |
| 1922 | ist = re.prog[state.pc].ist |
| 1923 | } else if state.pc >= re.prog.len { |
| 1924 | // println("ERROR!! PC overflow!!") |
| 1925 | return err_internal_error, state.i |
| 1926 | } |
| 1927 | |
| 1928 | //****************************************** |
| 1929 | // DEBUG LOG |
| 1930 | //****************************************** |
| 1931 | if re.debug > 0 { |
| 1932 | mut buf2 := strings.new_builder(re.cc.len + 128) |
| 1933 | |
| 1934 | // print all the instructions |
| 1935 | |
| 1936 | // end of the input text |
| 1937 | if state.i >= in_txt_len { |
| 1938 | buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n') |
| 1939 | sss := buf2.str() |
| 1940 | re.log_func(sss) |
| 1941 | } else { |
| 1942 | // print only the exe instruction |
| 1943 | if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 { |
| 1944 | if ist == ist_prog_end { |
| 1945 | buf2.write_string('# ${step_count:3d} PROG_END\n') |
| 1946 | } else if ist == 0 || m_state in [.start, .ist_next, .stop] { |
| 1947 | buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n') |
| 1948 | } else { |
| 1949 | ch, char_len = re.get_charb(in_txt, state.i) |
| 1950 | |
| 1951 | buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>') |
| 1952 | buf2.write_string('${ist:8x}'.replace(' ', '0')) |
| 1953 | buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ") |
| 1954 | |
| 1955 | if ist == ist_simple_char { |
| 1956 | if re.prog[state.pc].flag == 0 { |
| 1957 | buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]') |
| 1958 | } else { |
| 1959 | buf2.write_string('query_ch: [0x${re.prog[state.pc].ch:02X}]') |
| 1960 | } |
| 1961 | } else { |
| 1962 | if ist == ist_bsls_char { |
| 1963 | buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]') |
| 1964 | } else if ist == ist_prog_end { |
| 1965 | buf2.write_string('PROG_END') |
| 1966 | } else if ist == ist_or_branch { |
| 1967 | buf2.write_string('OR') |
| 1968 | } else if ist == ist_char_class_pos { |
| 1969 | buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]') |
| 1970 | } else if ist == ist_char_class_neg { |
| 1971 | buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]') |
| 1972 | } else if ist == ist_dot_char { |
| 1973 | buf2.write_string('DOT_CHAR') |
| 1974 | } else if ist == ist_group_start { |
| 1975 | tmp_gi := re.prog[state.pc].group_id |
| 1976 | tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep |
| 1977 | buf2.write_string('GROUP_START #:${tmp_gi} rep:${tmp_gr} ') |
| 1978 | } else if ist == ist_group_end { |
| 1979 | buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:${state.group_index}') |
| 1980 | } |
| 1981 | } |
| 1982 | if re.prog[state.pc].rep_max == max_quantifier { |
| 1983 | buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}') |
| 1984 | } else { |
| 1985 | buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}') |
| 1986 | } |
| 1987 | if re.prog[state.pc].greedy == true { |
| 1988 | buf2.write_string('?') |
| 1989 | } |
| 1990 | buf2.write_string(' (#${state.group_index})') |
| 1991 | |
| 1992 | if ist == ist_dot_char { |
| 1993 | buf2.write_string(' last!') |
| 1994 | } |
| 1995 | |
| 1996 | buf2.write_string('\n') |
| 1997 | } |
| 1998 | sss2 := buf2.str() |
| 1999 | re.log_func(sss2) |
| 2000 | } |
| 2001 | } |
| 2002 | step_count++ |
| 2003 | dbg_line++ |
| 2004 | } |
| 2005 | //****************************************** |
| 2006 | |
| 2007 | if ist == ist_prog_end { |
| 2008 | // println("HERE we end!") |
| 2009 | break |
| 2010 | } |
| 2011 | |
| 2012 | // we're out of text, manage it |
| 2013 | if state.i >= in_txt_len || m_state == .new_line { |
| 2014 | // println("Finished text!!") |
| 2015 | src_end = true |
| 2016 | |
| 2017 | // we have fished the text, we must manage out pf bound indexes |
| 2018 | if state.i >= in_txt_len { |
| 2019 | state.i = in_txt_len - 1 |
| 2020 | } |
| 2021 | |
| 2022 | // manage groups |
| 2023 | if state.group_index >= 0 && state.match_index >= 0 { |
| 2024 | // println("End text with open groups!") |
| 2025 | // println("state.group_index: ${state.group_index}") |
| 2026 | // close the groups |
| 2027 | for state.group_index >= 0 { |
| 2028 | tmp_pc := re.group_data[state.group_index] |
| 2029 | re.prog[tmp_pc].group_rep++ |
| 2030 | // println("Closing group ${state.group_index} {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}") |
| 2031 | |
| 2032 | if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min |
| 2033 | && re.prog[tmp_pc].group_id >= 0 { |
| 2034 | start_i := re.group_stack[state.group_index] |
| 2035 | re.group_stack[state.group_index] = -1 |
| 2036 | |
| 2037 | // save group results |
| 2038 | g_index := re.prog[tmp_pc].group_id * 2 |
| 2039 | // println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}") |
| 2040 | if start_i >= 0 { |
| 2041 | re.groups[g_index] = start_i |
| 2042 | } else { |
| 2043 | re.groups[g_index] = 0 |
| 2044 | } |
| 2045 | |
| 2046 | re.groups[g_index + 1] = state.i |
| 2047 | |
| 2048 | if re.groups[g_index + 1] >= in_txt_len { |
| 2049 | // println("clamp group on stop!") |
| 2050 | re.groups[g_index + 1] = in_txt_len - 1 |
| 2051 | } |
| 2052 | |
| 2053 | // continuous save, save until we have space |
| 2054 | re.group_continuous_save(g_index) |
| 2055 | } |
| 2056 | state.group_index-- |
| 2057 | } |
| 2058 | } |
| 2059 | |
| 2060 | // println("re.groups: ${re.groups}") |
| 2061 | |
| 2062 | // the text is finished and the groups closed and we are the last group, ok exit |
| 2063 | if ist == ist_group_end && re.prog[state.pc + 1].ist == ist_prog_end { |
| 2064 | // println("Last group end") |
| 2065 | return state.first_match, state.i |
| 2066 | } |
| 2067 | |
| 2068 | if state.pc == -1 { |
| 2069 | state.pc = last_fnd_pc |
| 2070 | } |
| 2071 | |
| 2072 | // println("Finished text!!") |
| 2073 | // println("Instruction: ${ist:08x} pc: ${state.pc}") |
| 2074 | // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}") |
| 2075 | |
| 2076 | // program end |
| 2077 | if ist == ist_prog_end { |
| 2078 | // println("Program end on end of text!") |
| 2079 | return state.first_match, state.i |
| 2080 | } |
| 2081 | |
| 2082 | if l_ist in [ |
| 2083 | u32(ist_char_class_neg), |
| 2084 | ist_char_class_pos, |
| 2085 | ist_bsls_char, |
| 2086 | ist_dot_char, |
| 2087 | ] { |
| 2088 | // println("***** We have a last special token") |
| 2089 | // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}") |
| 2090 | // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}") |
| 2091 | // println("first match: ${state.first_match}") |
| 2092 | |
| 2093 | if re.prog[state.pc].last_dot_flag == true |
| 2094 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min |
| 2095 | && re.prog[state.pc].rep <= re.prog[state.pc].rep_max { |
| 2096 | return state.first_match, state.i |
| 2097 | } |
| 2098 | // println("Not fitted!!") |
| 2099 | } |
| 2100 | // no groups open, check the last token quantifier |
| 2101 | if ist != ist_group_end && re.prog[state.pc + 1].ist == ist_prog_end { |
| 2102 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_min |
| 2103 | && re.prog[state.pc].rep <= re.prog[state.pc].rep_max { |
| 2104 | // println("We are in good repetition") |
| 2105 | return state.first_match, state.i |
| 2106 | } |
| 2107 | } |
| 2108 | |
| 2109 | // println("No good exit!!") |
| 2110 | if re.prog[re.prog_len - 1].ist == ist_group_end { |
| 2111 | // println("last ist is a group end!") |
| 2112 | if re.prog[re.prog_len - 1].group_rep >= re.prog[re.prog_len - 1].rep_min { |
| 2113 | return state.first_match, state.i |
| 2114 | } |
| 2115 | } |
| 2116 | return no_match_found, state.i |
| 2117 | } |
| 2118 | |
| 2119 | // starting and init |
| 2120 | if m_state == .start { |
| 2121 | state.pc = -1 |
| 2122 | state.i = 0 |
| 2123 | m_state = .ist_next |
| 2124 | continue |
| 2125 | } |
| 2126 | // ist_next, next instruction reseting its state |
| 2127 | else if m_state == .ist_next { |
| 2128 | state.pc = state.pc + 1 |
| 2129 | re.prog[state.pc].reset() |
| 2130 | // check if we are in the program bounds |
| 2131 | if state.pc < 0 || state.pc > re.prog.len { |
| 2132 | // println("ERROR!! PC overflow!!") |
| 2133 | return err_internal_error, state.i |
| 2134 | } |
| 2135 | m_state = .ist_load |
| 2136 | continue |
| 2137 | } |
| 2138 | // ist_next_ks, next instruction keeping its state |
| 2139 | else if m_state == .ist_next_ks { |
| 2140 | state.pc = state.pc + 1 |
| 2141 | // check if we are in the program bounds |
| 2142 | if state.pc < 0 || state.pc > re.prog.len { |
| 2143 | // println("ERROR!! PC overflow!!") |
| 2144 | return err_internal_error, state.i |
| 2145 | } |
| 2146 | m_state = .ist_load |
| 2147 | continue |
| 2148 | } |
| 2149 | |
| 2150 | // load the char |
| 2151 | ch, char_len = re.get_charb(in_txt, state.i) |
| 2152 | |
| 2153 | // check new line if flag f_nl enabled |
| 2154 | if (re.flag & f_nl) != 0 && char_len == 1 && u8(ch) in new_line_list { |
| 2155 | m_state = .new_line |
| 2156 | continue |
| 2157 | } |
| 2158 | // check if stop |
| 2159 | else if m_state == .stop { |
| 2160 | // we are in search mode, don't exit until the end |
| 2161 | if (re.flag & f_src) != 0 && ist != ist_prog_end { |
| 2162 | last_fnd_pc = state.pc |
| 2163 | state.pc = -1 |
| 2164 | state.i += char_len |
| 2165 | |
| 2166 | m_state = .ist_next |
| 2167 | re.reset_src() |
| 2168 | state.match_index = -1 |
| 2169 | state.first_match = -1 |
| 2170 | |
| 2171 | // reset state list |
| 2172 | re.reset() |
| 2173 | |
| 2174 | continue |
| 2175 | } |
| 2176 | |
| 2177 | if ist == ist_prog_end { |
| 2178 | return state.first_match, state.i |
| 2179 | } |
| 2180 | |
| 2181 | // manage here dot char |
| 2182 | |
| 2183 | if re.state_list.len > 0 { |
| 2184 | // println("Here we are, with stop: state buffer: [${re.state_list.len}]") |
| 2185 | state = re.state_list.pop() |
| 2186 | |
| 2187 | state.match_flag = true |
| 2188 | l_ist = u32(ist_dot_char) |
| 2189 | |
| 2190 | if state.first_match < 0 { |
| 2191 | state.first_match = state.i |
| 2192 | } |
| 2193 | state.match_index = state.i |
| 2194 | re.prog[state.pc].rep++ // increase repetitions |
| 2195 | |
| 2196 | state.i += char_len |
| 2197 | m_state = .ist_quant_p |
| 2198 | continue |
| 2199 | } |
| 2200 | |
| 2201 | // exit on no match |
| 2202 | return result, state.i |
| 2203 | } |
| 2204 | // ist_load |
| 2205 | else if m_state == .ist_load { |
| 2206 | // program end |
| 2207 | if ist == ist_prog_end { |
| 2208 | // if we are in match exit well |
| 2209 | |
| 2210 | if state.group_index >= 0 && state.match_index >= 0 { |
| 2211 | state.group_index = -1 |
| 2212 | } |
| 2213 | |
| 2214 | m_state = .stop |
| 2215 | continue |
| 2216 | } |
| 2217 | // check GROUP start, no quantifier is checkd for this token!! |
| 2218 | else if ist == ist_group_start { |
| 2219 | state.group_index++ |
| 2220 | re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape |
| 2221 | re.group_stack[state.group_index] = state.i // index where we start to manage |
| 2222 | // println("group_index ${state.group_index} rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}") |
| 2223 | |
| 2224 | m_state = .ist_next |
| 2225 | continue |
| 2226 | } |
| 2227 | // check GROUP end |
| 2228 | else if ist == ist_group_end { |
| 2229 | // we are in matching streak |
| 2230 | // println("Group END!! last ist: ${l_ist:08x}") |
| 2231 | if state.match_index >= 0 { |
| 2232 | // restore txt index stack and save the group data |
| 2233 | |
| 2234 | // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}") |
| 2235 | if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 { |
| 2236 | start_i := re.group_stack[state.group_index] |
| 2237 | |
| 2238 | // save group results |
| 2239 | g_index := re.prog[state.pc].group_id * 2 |
| 2240 | |
| 2241 | if start_i >= 0 { |
| 2242 | re.groups[g_index] = start_i |
| 2243 | } else { |
| 2244 | re.groups[g_index] = 0 |
| 2245 | } |
| 2246 | |
| 2247 | re.groups[g_index + 1] = state.i |
| 2248 | |
| 2249 | if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] { |
| 2250 | re.groups[g_index] = re.groups[g_index - 1] |
| 2251 | } |
| 2252 | |
| 2253 | if re.groups[g_index + 1] >= in_txt_len { |
| 2254 | // println("clamp group!") |
| 2255 | re.groups[g_index + 1] = in_txt_len - 1 |
| 2256 | } |
| 2257 | |
| 2258 | // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: ${state.i} in_txt_len: ${in_txt_len}") |
| 2259 | |
| 2260 | // continuous save, save until we have space |
| 2261 | re.group_continuous_save(g_index) |
| 2262 | } |
| 2263 | |
| 2264 | re.prog[state.pc].group_rep++ // increase repetitions |
| 2265 | // println("GROUP ${group_index} END ${re.prog[state.pc].group_rep}") |
| 2266 | if re.prog[state.pc].group_rep > in_txt_len - 1 { |
| 2267 | m_state = .ist_quant_ng |
| 2268 | continue |
| 2269 | } |
| 2270 | |
| 2271 | m_state = .ist_quant_pg |
| 2272 | continue |
| 2273 | } |
| 2274 | |
| 2275 | m_state = .ist_quant_ng |
| 2276 | continue |
| 2277 | } |
| 2278 | // check OR |
| 2279 | else if ist == ist_or_branch { |
| 2280 | if state.match_index >= 0 { |
| 2281 | state.pc = re.prog[state.pc].rep_max |
| 2282 | // println("ist_or_branch True pc: ${state.pc}") |
| 2283 | } else { |
| 2284 | state.pc = re.prog[state.pc].rep_min |
| 2285 | // println("ist_or_branch False pc: ${state.pc}") |
| 2286 | } |
| 2287 | re.prog[state.pc].reset() |
| 2288 | m_state = .ist_load |
| 2289 | continue |
| 2290 | } |
| 2291 | // check ist_dot_char |
| 2292 | else if ist == ist_dot_char { |
| 2293 | // println("ist_dot_char rep: ${re.prog[state.pc].rep}") |
| 2294 | |
| 2295 | // check next token to be false |
| 2296 | mut next_check_flag := false |
| 2297 | |
| 2298 | // if we are done with max go on dot char are dedicated case!! |
| 2299 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { |
| 2300 | re.state_list.pop() |
| 2301 | m_state = .ist_next |
| 2302 | continue |
| 2303 | } |
| 2304 | |
| 2305 | if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].dot_check_pc >= 0 |
| 2306 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { |
| 2307 | // load the char |
| 2308 | // ch_t, _ := re.get_charb(in_txt, state.i+char_len) |
| 2309 | ch_t := ch |
| 2310 | chk_pc := re.prog[state.pc].dot_check_pc |
| 2311 | next_check_flag = re.token_matches(chk_pc, ch_t) |
| 2312 | } |
| 2313 | |
| 2314 | // check if we must continue or pass to the next IST |
| 2315 | if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end { |
| 2316 | // println("save the state!!") |
| 2317 | mut dot_state := StateObj{ |
| 2318 | group_index: state.group_index |
| 2319 | match_flag: state.match_flag |
| 2320 | match_index: state.match_index |
| 2321 | first_match: state.first_match |
| 2322 | pc: state.pc |
| 2323 | i: state.i + char_len |
| 2324 | char_len: char_len |
| 2325 | last_dot_pc: state.pc |
| 2326 | } |
| 2327 | // if we are mananging a .* stay on the same char on return |
| 2328 | if re.prog[state.pc].rep_min == 0 { |
| 2329 | dot_state.i -= char_len |
| 2330 | } |
| 2331 | |
| 2332 | re.state_list << dot_state |
| 2333 | |
| 2334 | m_state = .ist_quant_n |
| 2335 | // println("dot_char stack len: ${re.state_list.len}") |
| 2336 | continue |
| 2337 | } |
| 2338 | |
| 2339 | state.match_flag = true |
| 2340 | l_ist = u32(ist_dot_char) |
| 2341 | |
| 2342 | if state.first_match < 0 { |
| 2343 | state.first_match = state.i |
| 2344 | } |
| 2345 | state.match_index = state.i |
| 2346 | re.prog[state.pc].rep++ // increase repetitions |
| 2347 | |
| 2348 | state.i += char_len |
| 2349 | m_state = .ist_quant_p |
| 2350 | continue |
| 2351 | } |
| 2352 | // char class IST |
| 2353 | else if ist == ist_char_class_pos || ist == ist_char_class_neg { |
| 2354 | // check next token to be false |
| 2355 | mut next_check_flag := false |
| 2356 | |
| 2357 | // if we are done with max go on dot char are dedicated case!! |
| 2358 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { |
| 2359 | re.state_list.pop() |
| 2360 | m_state = .ist_next |
| 2361 | continue |
| 2362 | } |
| 2363 | |
| 2364 | if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].cc_check_pc >= 0 |
| 2365 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { |
| 2366 | // load the char |
| 2367 | // ch_t, _ := re.get_charb(in_txt, state.i+char_len) |
| 2368 | ch_t := ch |
| 2369 | chk_pc := re.prog[state.pc].cc_check_pc |
| 2370 | next_check_flag = re.token_matches(chk_pc, ch_t) |
| 2371 | } |
| 2372 | |
| 2373 | // check if we must continue or pass to the next IST |
| 2374 | if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end { |
| 2375 | // println("save the state!!") |
| 2376 | mut dot_state := StateObj{ |
| 2377 | group_index: state.group_index |
| 2378 | match_flag: state.match_flag |
| 2379 | match_index: state.match_index |
| 2380 | first_match: state.first_match |
| 2381 | pc: state.pc |
| 2382 | i: state.i + char_len |
| 2383 | char_len: char_len |
| 2384 | last_dot_pc: state.pc |
| 2385 | } |
| 2386 | // if we are managing a \[something]* stay on the same char on return |
| 2387 | if re.prog[state.pc].rep_min == 0 { |
| 2388 | dot_state.i -= char_len |
| 2389 | } |
| 2390 | |
| 2391 | re.state_list << dot_state |
| 2392 | |
| 2393 | m_state = .ist_quant_n |
| 2394 | // println("dot_char stack len: ${re.state_list.len}") |
| 2395 | continue |
| 2396 | } |
| 2397 | |
| 2398 | // println("HERE WE MUST STAY! ${state.i} >= ${in_txt_len}") |
| 2399 | |
| 2400 | state.match_flag = false |
| 2401 | mut cc_neg := false |
| 2402 | |
| 2403 | if ist == ist_char_class_neg { |
| 2404 | cc_neg = true |
| 2405 | } |
| 2406 | mut cc_res := re.check_char_class(state.pc, ch) |
| 2407 | |
| 2408 | // manage out of text on char class parse |
| 2409 | if state.i >= (in_txt_len - 1) && cc_neg && re.prog[state.pc].last_dot_flag { |
| 2410 | m_state = .ist_quant_n |
| 2411 | continue |
| 2412 | } |
| 2413 | |
| 2414 | if cc_neg { |
| 2415 | cc_res = !cc_res |
| 2416 | } |
| 2417 | |
| 2418 | if cc_res { |
| 2419 | state.match_flag = true |
| 2420 | l_ist = u32(ist_char_class_pos) |
| 2421 | |
| 2422 | if state.first_match < 0 { |
| 2423 | state.first_match = state.i |
| 2424 | } |
| 2425 | |
| 2426 | state.match_index = state.i |
| 2427 | |
| 2428 | re.prog[state.pc].rep++ // increase repetitions |
| 2429 | state.i += char_len // next char |
| 2430 | m_state = .ist_quant_p |
| 2431 | continue |
| 2432 | } |
| 2433 | m_state = .ist_quant_n |
| 2434 | continue |
| 2435 | } |
| 2436 | // check bsls |
| 2437 | else if ist == ist_bsls_char { |
| 2438 | // println("ist_bsls_char rep: ${re.prog[state.pc].rep}") |
| 2439 | |
| 2440 | // check next token to be false |
| 2441 | mut next_check_flag := false |
| 2442 | |
| 2443 | // if we are done with max go on dot char are dedicated case!! |
| 2444 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { |
| 2445 | re.state_list.pop() |
| 2446 | m_state = .ist_next |
| 2447 | continue |
| 2448 | } |
| 2449 | |
| 2450 | if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].bsls_check_pc >= 0 |
| 2451 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { |
| 2452 | // load the char |
| 2453 | // ch_t, _ := re.get_charb(in_txt, state.i+char_len) |
| 2454 | ch_t := ch |
| 2455 | chk_pc := re.prog[state.pc].bsls_check_pc |
| 2456 | next_check_flag = re.token_matches(chk_pc, ch_t) |
| 2457 | } |
| 2458 | |
| 2459 | // check if we must continue or pass to the next IST |
| 2460 | if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end { |
| 2461 | // println("save the state!!") |
| 2462 | mut dot_state := StateObj{ |
| 2463 | group_index: state.group_index |
| 2464 | match_flag: state.match_flag |
| 2465 | match_index: state.match_index |
| 2466 | first_match: state.first_match |
| 2467 | pc: state.pc |
| 2468 | i: state.i + char_len |
| 2469 | char_len: char_len |
| 2470 | last_dot_pc: state.pc |
| 2471 | } |
| 2472 | // if we are managing a \[something]* stay on the same char on return |
| 2473 | if re.prog[state.pc].rep_min == 0 { |
| 2474 | dot_state.i -= char_len |
| 2475 | } |
| 2476 | |
| 2477 | re.state_list << dot_state |
| 2478 | |
| 2479 | m_state = .ist_quant_n |
| 2480 | // println("dot_char stack len: ${re.state_list.len}") |
| 2481 | continue |
| 2482 | } |
| 2483 | |
| 2484 | tmp_res := re.validator_matches(re.prog[state.pc].validator, ch) |
| 2485 | if tmp_res == false { |
| 2486 | m_state = .ist_quant_n |
| 2487 | continue |
| 2488 | } |
| 2489 | // println("${ch} => ${tmp_res}") |
| 2490 | |
| 2491 | state.match_flag = true |
| 2492 | l_ist = u32(ist_dot_char) |
| 2493 | |
| 2494 | if state.first_match < 0 { |
| 2495 | state.first_match = state.i |
| 2496 | } |
| 2497 | state.match_index = state.i |
| 2498 | re.prog[state.pc].rep++ // increase repetitions |
| 2499 | |
| 2500 | state.i += char_len |
| 2501 | m_state = .ist_quant_p |
| 2502 | continue |
| 2503 | } |
| 2504 | // simple char IST |
| 2505 | else if ist == ist_simple_char { |
| 2506 | // println("ist_simple_char") |
| 2507 | state.match_flag = false |
| 2508 | |
| 2509 | if re.chars_equal(re.prog[state.pc].ch, ch) |
| 2510 | && (state.i < in_txt_len - 1 || re.prog[state.pc].ch != 0) { |
| 2511 | state.match_flag = true |
| 2512 | l_ist = ist_simple_char |
| 2513 | |
| 2514 | if state.first_match < 0 { |
| 2515 | state.first_match = state.i |
| 2516 | } |
| 2517 | // println("state.match_index: ${state.match_index}") |
| 2518 | state.match_index = state.i |
| 2519 | |
| 2520 | re.prog[state.pc].rep++ // increase repetitions |
| 2521 | state.i += char_len // next char |
| 2522 | m_state = .ist_quant_p |
| 2523 | continue |
| 2524 | } |
| 2525 | m_state = .ist_quant_n |
| 2526 | continue |
| 2527 | } |
| 2528 | // UNREACHABLE |
| 2529 | // println("PANIC2!! state: ${m_state}") |
| 2530 | return err_internal_error, state.i |
| 2531 | } |
| 2532 | /*********************************** |
| 2533 | * Quantifier management |
| 2534 | ***********************************/ |
| 2535 | // ist_quant_ng => quantifier negative test on group |
| 2536 | else if m_state == .ist_quant_ng { |
| 2537 | // we are finished here |
| 2538 | if state.group_index < 0 { |
| 2539 | // println("Early stop!") |
| 2540 | result = no_match_found |
| 2541 | m_state = .stop |
| 2542 | continue |
| 2543 | } |
| 2544 | |
| 2545 | tmp_pc := re.group_data[state.group_index] // PC to the end of the group token |
| 2546 | rep := re.prog[tmp_pc].group_rep // use a temp variable |
| 2547 | re.prog[tmp_pc].group_rep = 0 // clear the repetitions |
| 2548 | |
| 2549 | // println(".ist_quant_ng group_pc_end: ${tmp_pc} rep: ${rep}") |
| 2550 | |
| 2551 | if rep >= re.prog[tmp_pc].rep_min { |
| 2552 | // println("ist_quant_ng GROUP CLOSED OK group_index: ${state.group_index}") |
| 2553 | |
| 2554 | state.i = re.group_stack[state.group_index] |
| 2555 | state.pc = tmp_pc |
| 2556 | state.group_index-- |
| 2557 | m_state = .ist_next |
| 2558 | continue |
| 2559 | } else if re.prog[tmp_pc].next_is_or { |
| 2560 | // println("ist_quant_ng OR Negative branch") |
| 2561 | |
| 2562 | state.i = re.group_stack[state.group_index] |
| 2563 | state.pc = re.prog[tmp_pc + 1].rep_min - 1 |
| 2564 | state.group_index-- |
| 2565 | m_state = .ist_next |
| 2566 | continue |
| 2567 | } else if rep > 0 && rep < re.prog[tmp_pc].rep_min { |
| 2568 | // println("ist_quant_ng UNDER THE MINIMUM g.i: ${state.group_index}") |
| 2569 | |
| 2570 | // check if we are inside a group, if yes exit from the nested groups |
| 2571 | if state.group_index > 0 { |
| 2572 | state.group_index-- |
| 2573 | state.pc = tmp_pc |
| 2574 | m_state = .ist_quant_ng //.ist_next |
| 2575 | continue |
| 2576 | } |
| 2577 | |
| 2578 | if state.group_index == 0 { |
| 2579 | state.group_index-- |
| 2580 | state.pc = tmp_pc // TEST |
| 2581 | m_state = .ist_next |
| 2582 | continue |
| 2583 | } |
| 2584 | |
| 2585 | result = no_match_found |
| 2586 | m_state = .stop |
| 2587 | continue |
| 2588 | } else if rep == 0 && rep < re.prog[tmp_pc].rep_min { |
| 2589 | // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: ${state.group_index}") |
| 2590 | |
| 2591 | if state.group_index > 0 { |
| 2592 | state.group_index-- |
| 2593 | state.pc = tmp_pc |
| 2594 | m_state = .ist_quant_ng //.ist_next |
| 2595 | continue |
| 2596 | } |
| 2597 | |
| 2598 | result = no_match_found |
| 2599 | m_state = .stop |
| 2600 | continue |
| 2601 | } |
| 2602 | |
| 2603 | // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${rep}") |
| 2604 | // UNREACHABLE |
| 2605 | return err_internal_error, state.i |
| 2606 | } |
| 2607 | // ist_quant_pg => quantifier positive test on group |
| 2608 | else if m_state == .ist_quant_pg { |
| 2609 | // println(".ist_quant_pg") |
| 2610 | mut tmp_pc := state.pc |
| 2611 | if state.group_index >= 0 { |
| 2612 | tmp_pc = re.group_data[state.group_index] |
| 2613 | } |
| 2614 | |
| 2615 | if re.prog[tmp_pc].group_neg == true { |
| 2616 | // println("***** Negation of the group") |
| 2617 | result = no_match_found |
| 2618 | m_state = .stop |
| 2619 | continue |
| 2620 | } |
| 2621 | |
| 2622 | rep := re.prog[tmp_pc].group_rep |
| 2623 | |
| 2624 | if rep < re.prog[tmp_pc].rep_min { |
| 2625 | // println("ist_quant_pg UNDER RANGE") |
| 2626 | state.pc = re.prog[tmp_pc].goto_pc |
| 2627 | m_state = .ist_next |
| 2628 | continue |
| 2629 | } else if rep == re.prog[tmp_pc].rep_max { |
| 2630 | // println("ist_quant_pg MAX RANGE") |
| 2631 | re.prog[tmp_pc].group_rep = 0 // clear the repetitions |
| 2632 | state.group_index-- |
| 2633 | m_state = .ist_next |
| 2634 | |
| 2635 | continue |
| 2636 | } else if rep >= re.prog[tmp_pc].rep_min { |
| 2637 | // println("ist_quant_pg IN RANGE group_index:${state.group_index}") |
| 2638 | |
| 2639 | // check greedy flag, if true exit on minimum |
| 2640 | if re.prog[tmp_pc].greedy == true { |
| 2641 | re.prog[tmp_pc].group_rep = 0 // clear the repetitions |
| 2642 | state.group_index-- |
| 2643 | m_state = .ist_next |
| 2644 | continue |
| 2645 | } |
| 2646 | |
| 2647 | state.pc = re.prog[tmp_pc].goto_pc - 1 |
| 2648 | state.group_index-- |
| 2649 | m_state = .ist_next |
| 2650 | continue |
| 2651 | } |
| 2652 | |
| 2653 | // UNREACHABLE |
| 2654 | // println("PANIC3!! state: ${m_state}") |
| 2655 | return err_internal_error, state.i |
| 2656 | } |
| 2657 | // ist_quant_n => quantifier negative test on token |
| 2658 | else if m_state == .ist_quant_n { |
| 2659 | rep := re.prog[state.pc].rep |
| 2660 | // println("Here!! PC ${state.pc} is_next_or: ${re.prog[state.pc].next_is_or}") |
| 2661 | |
| 2662 | // zero quantifier * or ? |
| 2663 | if rep == 0 && re.prog[state.pc].rep_min == 0 { |
| 2664 | // println("ist_quant_n c_zero RANGE MIN") |
| 2665 | m_state = .ist_next // go to next ist |
| 2666 | continue |
| 2667 | } |
| 2668 | // match + or * |
| 2669 | else if rep >= re.prog[state.pc].rep_min { |
| 2670 | // println("ist_quant_n MATCH RANGE") |
| 2671 | m_state = .ist_next |
| 2672 | continue |
| 2673 | } |
| 2674 | |
| 2675 | // check the OR if present |
| 2676 | if re.prog[state.pc].next_is_or { |
| 2677 | // println("OR present on failing") |
| 2678 | state.match_index = -1 |
| 2679 | m_state = .ist_next |
| 2680 | continue |
| 2681 | } |
| 2682 | |
| 2683 | // we are in a group manage no match from here |
| 2684 | if state.group_index >= 0 { |
| 2685 | // println("ist_quant_n FAILED insied a GROUP group_index:${state.group_index}") |
| 2686 | m_state = .ist_quant_ng |
| 2687 | continue |
| 2688 | } |
| 2689 | |
| 2690 | // no other options |
| 2691 | // println("ist_quant_n no_match_found") |
| 2692 | result = no_match_found |
| 2693 | m_state = .stop |
| 2694 | |
| 2695 | // stop already started matching outside a capturing group |
| 2696 | if re.state_list.len > 0 && re.state_list.last().group_index == -1 |
| 2697 | && re.state_list.last().last_dot_pc > 0 { |
| 2698 | if ist == ist_dot_char || ist == ist_bsls_char { |
| 2699 | return no_match_found, 0 |
| 2700 | } |
| 2701 | } |
| 2702 | continue |
| 2703 | } |
| 2704 | // ist_quant_p => quantifier positive test on token |
| 2705 | else if m_state == .ist_quant_p { |
| 2706 | // println("Here .ist_quant_p") |
| 2707 | // exit on first match |
| 2708 | if (re.flag & f_efm) != 0 { |
| 2709 | return state.i, state.i + 1 |
| 2710 | } |
| 2711 | |
| 2712 | rep := re.prog[state.pc].rep |
| 2713 | // println("ist_quant_p rep: ${rep} rep_min: ${re.prog[state.pc].rep_min}") |
| 2714 | |
| 2715 | // under range |
| 2716 | if rep > 0 && rep < re.prog[state.pc].rep_min { |
| 2717 | // println("ist_quant_p UNDER RANGE") |
| 2718 | m_state = .ist_load // continue the loop |
| 2719 | continue |
| 2720 | } |
| 2721 | // range ok, continue loop |
| 2722 | else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max { |
| 2723 | // println("ist_quant_p IN RANGE") |
| 2724 | |
| 2725 | // check greedy flag, if true exit on minimum |
| 2726 | if re.prog[state.pc].greedy == true { |
| 2727 | m_state = .ist_next |
| 2728 | continue |
| 2729 | } |
| 2730 | m_state = .ist_load |
| 2731 | continue |
| 2732 | } |
| 2733 | // max reached |
| 2734 | else if rep == re.prog[state.pc].rep_max { |
| 2735 | // println("ist_quant_p MAX RANGE") |
| 2736 | m_state = .ist_next |
| 2737 | continue |
| 2738 | } |
| 2739 | } |
| 2740 | // UNREACHABLE |
| 2741 | // println("PANIC4!! state: ${m_state}") |
| 2742 | return err_internal_error, state.i |
| 2743 | } |
| 2744 | |
| 2745 | // println("Check end of text!") |
| 2746 | // Check the results |
| 2747 | if ist == ist_prog_end && state.first_match < 0 { |
| 2748 | return state.i, state.i |
| 2749 | } |
| 2750 | if state.match_index >= 0 { |
| 2751 | if state.group_index < 0 { |
| 2752 | if re.prog[state.pc].ist == ist_prog_end { |
| 2753 | // println("program ended!!") |
| 2754 | |
| 2755 | if (re.flag & f_src) != 0 { |
| 2756 | // println("find return") |
| 2757 | return state.first_match, state.i |
| 2758 | } else { |
| 2759 | // println("Here!!") |
| 2760 | return 0, state.i |
| 2761 | } |
| 2762 | } |
| 2763 | |
| 2764 | // println("No Group here, natural end [${state.first_match},${state.i}] state: ${state_str(m_state)} ist: ${ist} pgr_end: ${re.prog.len}") |
| 2765 | |
| 2766 | if re.prog[state.pc + 1].ist == ist_prog_end || re.prog[state.pc].ist == ist_prog_end { |
| 2767 | rep := re.prog[state.pc].rep |
| 2768 | // println("rep: ${rep} re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}") |
| 2769 | if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max { |
| 2770 | return state.first_match, state.i |
| 2771 | } |
| 2772 | // println("Program not finished! ") |
| 2773 | return no_match_found, state.i |
| 2774 | } |
| 2775 | if src_end { |
| 2776 | // println("program end") |
| 2777 | return state.first_match, state.i |
| 2778 | } |
| 2779 | // print("No match found!!") |
| 2780 | return no_match_found, state.i |
| 2781 | } else { |
| 2782 | // println("Group match! OK") |
| 2783 | // println("first_match: ${state.first_match}, i: ${state.i}") |
| 2784 | |
| 2785 | // println("Skip last group") |
| 2786 | return state.first_match, state.i |
| 2787 | // return state.first_match,re.group_stack[state.group_index--] |
| 2788 | } |
| 2789 | } |
| 2790 | // println("no_match_found, natural end") |
| 2791 | return no_match_found, state.i |
| 2792 | } |
| 2793 | |