v / vlib / regex / regex.v
2792 lines · 2441 sloc · 70.96 KB · e2e5cf8db56f3562c7baa735061690be936bdf3e
Raw
1/*
2regex 1.0 alpha
3
4Copyright (c) 2019-2024 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7
8This file contains regex module
9
10Know limitation:
11- find is implemented in a trivial way
12- not full compliant PCRE
13- not compliant POSIX ERE
14*/
15module regex
16
17import strings
18
19pub const v_regex_version = '1.0 alpha' // regex module version
20pub const max_code_len = 256 // default small base code len for the regex programs
21pub const max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
22
23// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
24pub const spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
25// new line chars for now only '\n'
26pub const new_line_list = [`\n`, `\r`]
27
28// Results
29pub const no_match_found = -1
30
31// Errors
32pub const compile_ok = 0 // the regex string compiled, all ok
33pub const err_char_unknown = -2 // the char used is unknow to the system
34pub const err_undefined = -3 // the compiler symbol is undefined
35pub const err_internal_error = -4 // Bug in the regex system!!
36pub const err_cc_alloc_overflow = -5 // memory for char class full!!
37pub const err_syntax_error = -6 // syntax error in regex compiling
38pub const err_groups_overflow = -7 // max number of groups reached
39pub const err_groups_max_nested = -8 // max number of nested group reached
40pub const err_group_not_balanced = -9 // group not balanced
41pub const err_group_qm_notation = -10 // group invalid notation
42pub const err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
43pub const err_neg_group_quantifier = -12 // negation groups can not have quantifier
44pub const err_consecutive_dots = -13
45
46//*************************************
47// regex program instructions
48//*************************************
49const ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char
50
51// char class 11 0100 AA xxxxxxxx
52// AA = 00 regular class
53// AA = 01 Negated class ^ char
54const ist_char_class = u32(0xD1000000) // MASK
55const ist_char_class_pos = u32(0xD0000000) // char class normal [abc]
56const ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]
57
58// dot char 10 0110 xx xxxxxxxx
59const ist_dot_char = u32(0x98000000) // match any char except \n
60
61// backslash chars 10 0100 xx xxxxxxxx
62const ist_bsls_char = u32(0x90000000) // backslash char
63
64// OR | 10 010Y xx xxxxxxxx
65const ist_or_branch = u32(0x91000000) // OR case
66
67// groups 10 010Y xx xxxxxxxx
68const ist_group_start = u32(0x92000000) // group start (
69const ist_group_end = u32(0x94000000) // group end )
70
71// control instructions
72const ist_prog_end = u32(0x88000000)
73
74/*
75General Utilities
76*/
77// utf8util_char_len calculate the length in bytes of a utf8 char
78@[inline]
79fn utf8util_char_len(b u8) int {
80 return int(((u32(0xe5000000) >> ((b >> 3) & 0x1e)) & 3) + 1)
81}
82
83// get_char get a char from position i and return an u32 with the unicode code
84@[direct_array_access; inline]
85fn (re &RE) get_char(in_txt string, i int) (u32, int) {
86 ini := unsafe { in_txt.str[i] }
87 // ascii 8 bit
88 if (re.flag & f_bin) != 0 || ini & 0x80 == 0 {
89 return u32(ini), 1
90 }
91 // unicode char
92 char_len := utf8util_char_len(ini)
93 mut tmp := 0
94 mut ch := u32(0)
95 for tmp < char_len {
96 ch = (ch << 8) | unsafe { in_txt.str[i + tmp] }
97 tmp++
98 }
99 return ch, char_len
100}
101
102// get_charb get a char from position i and return an u32 with the unicode code
103@[direct_array_access; inline]
104fn (re &RE) get_charb(in_txt &u8, i int) (u32, int) {
105 // ascii 8 bit
106 if (re.flag & f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 {
107 return u32(unsafe { in_txt[i] }), 1
108 }
109 // unicode char
110 char_len := utf8util_char_len(unsafe { in_txt[i] })
111 mut tmp := 0
112 mut ch := u32(0)
113 for tmp < char_len {
114 ch = (ch << 8) | unsafe { in_txt[i + tmp] }
115 tmp++
116 }
117 return ch, char_len
118}
119
120@[inline]
121fn is_alnum(in_char u8) bool {
122 mut tmp := in_char - `A`
123 if tmp <= 25 {
124 return true
125 }
126 tmp = in_char - `a`
127 if tmp <= 25 {
128 return true
129 }
130 tmp = in_char - `0`
131 if tmp <= 9 {
132 return true
133 }
134 if in_char == `_` {
135 return true
136 }
137 return false
138}
139
140@[inline]
141fn is_not_alnum(in_char u8) bool {
142 return !is_alnum(in_char)
143}
144
145@[inline]
146fn is_space(in_char u8) bool {
147 return in_char in spaces
148}
149
150@[inline]
151fn is_not_space(in_char u8) bool {
152 return !is_space(in_char)
153}
154
155@[inline]
156fn is_digit(in_char u8) bool {
157 tmp := in_char - `0`
158 return tmp <= 0x09
159}
160
161@[inline]
162fn is_not_digit(in_char u8) bool {
163 return !is_digit(in_char)
164}
165
166/*
167@[inline]
168fn is_wordchar(in_char byte) bool {
169 return is_alnum(in_char) || in_char == `_`
170}
171
172@[inline]
173fn is_not_wordchar(in_char byte) bool {
174 return !is_alnum(in_char)
175}
176*/
177
178@[inline]
179fn is_lower(in_char u8) bool {
180 tmp := in_char - `a`
181 return tmp <= 25
182}
183
184@[inline]
185fn is_upper(in_char u8) bool {
186 tmp := in_char - `A`
187 return tmp <= 25
188}
189
190pub fn (re &RE) get_parse_error_string(err int) string {
191 match err {
192 compile_ok { return 'compile_ok' }
193 no_match_found { return 'no_match_found' }
194 err_char_unknown { return 'err_char_unknown' }
195 err_undefined { return 'err_undefined' }
196 err_internal_error { return 'err_internal_error' }
197 err_cc_alloc_overflow { return 'err_cc_alloc_overflow' }
198 err_syntax_error { return 'err_syntax_error' }
199 err_groups_overflow { return 'err_groups_overflow' }
200 err_groups_max_nested { return 'err_groups_max_nested' }
201 err_group_not_balanced { return 'err_group_not_balanced' }
202 err_group_qm_notation { return 'err_group_qm_notation' }
203 err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
204 err_neg_group_quantifier { return 'err_neg_group_quantifier' }
205 err_consecutive_dots { return 'err_consecutive_dots' }
206 else { return 'err_unknown' }
207 }
208}
209
210// utf8_str convert and utf8 sequence to a printable string
211@[inline]
212fn utf8_str(ch rune) string {
213 mut i := 4
214 mut res := ''
215 for i > 0 {
216 v := u8((ch >> ((i - 1) * 8)) & 0xFF)
217 if v != 0 {
218 res += '${v:1c}'
219 }
220 i--
221 }
222 return res
223}
224
225// simple_log default log function
226fn simple_log(txt string) {
227 print(txt)
228}
229
230/******************************************************************************
231*
232* Token Structs
233*
234******************************************************************************/
235pub type FnValidator = fn (u8) bool
236
237struct Token {
238mut:
239 ist u32
240 // char
241 ch rune // char of the token if any
242 ch_len u8 // char len
243 flag u8 // flag for general usage
244 // Quantifiers / branch
245 rep_min int // used also for jump next in the OR branch [no match] pc jump
246 rep_max int // used also for jump next in the OR branch [ match] pc jump
247 greedy bool // greedy quantifier flag
248 // Char class
249 cc_index int = -1
250 // counters for quantifier check (repetitions)
251 rep int
252 // validator function pointer
253 validator FnValidator = unsafe { nil }
254 // groups variables
255 group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion
256 group_rep int // repetition of the group
257 group_id int = -1 // id of the group
258 goto_pc int = -1 // jump to this PC if is needed
259 // OR flag for the token
260 next_is_or bool // true if the next token is an OR
261 // dot_char token variables
262 dot_check_pc int = -1 // pc of the next token to check for dots
263 bsls_check_pc int = -1 // pc of the next token to check for bsls
264 cc_check_pc int = -1 // pc of the next token to check for CC
265 last_dot_flag bool // if true indicate that is the last dot_char in the regex
266 // debug fields
267 source_index int
268}
269
270@[inline]
271fn (mut tok Token) reset() {
272 tok.rep = 0
273}
274
275/******************************************************************************
276*
277* Regex struct
278*
279******************************************************************************/
280pub const f_nl = 0x00000001 // end the match when find a new line symbol
281pub const f_ms = 0x00000002 // match true only if the match is at the start of the string
282pub const f_me = 0x00000004 // match true only if the match is at the end of the string
283pub const f_efm = 0x00000100 // exit on first token matched, used by search
284pub const f_bin = 0x00000200 // work only on bytes, ignore utf-8
285pub const f_ci = 0x00000400 // compare ASCII letters without case sensitivity
286
287// behaviour modifier flags
288pub const f_src = 0x00020000
289
290// Log function prototype
291pub type FnLog = fn (string)
292
293pub struct RE {
294pub mut:
295 prog []Token
296 prog_len int // regex program len
297 // char classes storage
298 cc []CharClass // char class list
299 cc_index int // index
300 // groups
301 group_count int // number of groups in this regex struct
302 groups []int // groups index results
303 group_max_nested int = 3 // max nested group
304 group_max int = 8 // max allowed number of different groups
305
306 state_list []StateObj
307
308 group_csave_flag bool // flag to enable continuous saving
309 group_csave []int //= []int{} // groups continuous save list
310
311 group_map map[string]int // groups names map
312
313 group_stack []int
314 group_data []int
315 // flags
316 flag int // flag for optional parameters
317 // Debug/log
318 debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
319 log_func FnLog = simple_log // log function, can be customized by the user
320 query string // query string
321}
322
323// Reset RE object
324@[direct_array_access; inline]
325pub fn (mut re RE) reset() {
326 re.cc_index = 0
327
328 mut i := 0
329 for i < re.prog_len {
330 re.prog[i].group_rep = 0 // clear repetition of the group
331 re.prog[i].rep = 0 // clear repetition of the token
332 i++
333 }
334
335 // init groups array
336 if re.group_count > 0 {
337 if re.groups.len == 0 {
338 // first run alloc memory
339 re.groups = []int{len: re.group_count * 2, init: -1}
340 } else {
341 // subsequent executions, only clean up the memory
342 i = 0
343 for i < re.groups.len {
344 re.groups[i] = -1
345 i++
346 }
347 }
348 }
349
350 // reset group_csave
351 if re.group_csave_flag == true {
352 re.group_csave.clear() // = []int{}
353 }
354
355 // reset state list
356 re.state_list.clear()
357 // restore initial state of the stack
358 for mut x in re.group_stack {
359 x = -1
360 }
361}
362
363// reset for search mode fail
364// gcc bug, dont use @[inline] or go 5 time slower
365//[inline]
366@[direct_array_access]
367fn (mut re RE) reset_src() {
368 mut i := 0
369 for i < re.prog_len {
370 re.prog[i].group_rep = 0 // clear repetition of the group
371 re.prog[i].rep = 0 // clear repetition of the token
372 i++
373 }
374}
375
376/******************************************************************************
377*
378* Backslashes chars
379*
380******************************************************************************/
381struct BslsStruct {
382 ch rune // meta char
383 validator FnValidator = unsafe { nil } // validator function pointer
384}
385
386const bsls_validator_array = [
387 BslsStruct{`w`, is_alnum},
388 BslsStruct{`W`, is_not_alnum},
389 BslsStruct{`s`, is_space},
390 BslsStruct{`S`, is_not_space},
391 BslsStruct{`d`, is_digit},
392 BslsStruct{`D`, is_not_digit},
393 BslsStruct{`a`, is_lower},
394 BslsStruct{`A`, is_upper},
395]
396
397// these chars are escape if preceded by a \
398const bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`,
399 `^`, `!`]
400
401enum BSLS_parse_state {
402 start
403 bsls_found
404 bsls_char
405 normal_char
406 hex_char
407}
408
409// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
410fn (re &RE) parse_bsls(in_txt string, in_i int) (int, int, u32) {
411 mut status := BSLS_parse_state.start
412 mut i := in_i
413 mut hex_max_len := 2
414 mut hex_res := u32(0)
415 mut hex_count := 0
416
417 for i < in_txt.len {
418 // get our char
419 char_tmp, char_len := re.get_char(in_txt, i)
420 ch := u8(char_tmp)
421 // println("ch [${ch:c}]")
422
423 if status == .start && ch == `\\` {
424 status = .bsls_found
425 i += char_len
426 continue
427 }
428
429 // check if is our bsls char, for now only one length sequence
430 if status == .bsls_found {
431 for c, x in bsls_validator_array {
432 if x.ch == ch {
433 return c, i - in_i + 1, hex_res
434 }
435 }
436
437 // check for \x00 hex 8bit
438 if ch == `x` {
439 status = .hex_char
440 hex_max_len = 2
441 i += char_len
442 continue
443 }
444
445 // check for \x00 hex 16bit
446 if ch == `X` {
447 status = .hex_char
448 hex_max_len = 4
449 i += char_len
450 continue
451 }
452
453 status = .normal_char
454 continue
455 }
456
457 // manage hex byte
458 if status == .hex_char {
459 if ch >= `0` && ch <= `9` {
460 hex_count++
461 hex_res <<= 4
462 hex_res += u32(ch - `0`)
463 i += char_len
464 } else if ch >= `A` && ch <= `F` {
465 hex_count++
466 hex_res <<= 4
467 hex_res += u32(ch - `A` + 10)
468 i += char_len
469 } else if ch >= `a` && ch <= `f` {
470 hex_count++
471 hex_res <<= 4
472 hex_res += u32(ch - `a` + 10)
473 i += char_len
474 } else {
475 return err_syntax_error, i, hex_res
476 }
477
478 // println("hex_res: ${hex_res:08x} hex_count: ${hex_count}")
479
480 // look for more hex digits
481 if hex_count < hex_max_len {
482 continue
483 }
484
485 // if over 8 nibble is more than 32 bit, error
486 if hex_count > hex_max_len {
487 return err_syntax_error, i - in_i, hex_res
488 }
489
490 if hex_count == hex_max_len {
491 // we have a good result
492 // println("RESULT hex_res: ${hex_res:08x} hex_count: ${hex_count}")
493 return -2, i - in_i, hex_res
494 }
495
496 // MUST NOT BE HERE!
497 return err_syntax_error, i, hex_res
498 }
499
500 // no BSLS validator, manage as normal escape char char
501 if status == .normal_char {
502 if ch in bsls_escape_list {
503 return no_match_found, i - in_i + 1, hex_res
504 }
505 return err_syntax_error, i - in_i + 1, hex_res
506 }
507
508 // at the present time we manage only one char after the \
509 break
510 }
511 // not our bsls return KO
512 return err_syntax_error, i, hex_res
513}
514
515/******************************************************************************
516*
517* Char class
518*
519******************************************************************************/
520const cc_null = 0 // empty cc token
521
522const cc_char = 1 // simple char: a
523
524const cc_int = 2 // char interval: a-z
525
526const cc_bsls = 3 // backslash char
527
528const cc_end = 4
529
530struct CharClass {
531mut:
532 cc_type int // type of cc token
533 ch0 rune // first char of the interval a-b a in this case
534 ch1 rune // second char of the interval a-b b in this case
535 validator FnValidator = unsafe { nil } // validator function pointer
536}
537
538enum CharClass_parse_state {
539 start
540 in_char
541 in_bsls
542 separator
543 finish
544}
545
546fn (re &RE) get_char_class(pc int) string {
547 buf := []u8{len: (re.cc.len)}
548 mut buf_ptr := unsafe { &u8(&buf) }
549
550 mut cc_i := re.prog[pc].cc_index
551 mut i := 0
552 mut tmp := 0
553 for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end {
554 if re.cc[cc_i].cc_type == cc_bsls {
555 unsafe {
556 buf_ptr[i] = `\\`
557 i++
558 buf_ptr[i] = u8(re.cc[cc_i].ch0)
559 i++
560 }
561 } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 {
562 tmp = 3
563 for tmp >= 0 {
564 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
565 if x != 0 {
566 unsafe {
567 buf_ptr[i] = x
568 i++
569 }
570 }
571 tmp--
572 }
573 } else {
574 tmp = 3
575 for tmp >= 0 {
576 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
577 if x != 0 {
578 unsafe {
579 buf_ptr[i] = x
580 i++
581 }
582 }
583 tmp--
584 }
585 unsafe {
586 buf_ptr[i] = `-`
587 i++
588 }
589 tmp = 3
590 for tmp >= 0 {
591 x := u8((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF)
592 if x != 0 {
593 unsafe {
594 buf_ptr[i] = x
595 i++
596 }
597 }
598 tmp--
599 }
600 }
601 cc_i++
602 }
603 unsafe {
604 buf_ptr[i] = u8(0)
605 }
606 return unsafe { tos_clone(buf_ptr) }
607}
608
609fn (re &RE) check_char_class(pc int, ch rune) bool {
610 mut cc_i := re.prog[pc].cc_index
611 for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end {
612 if re.cc[cc_i].cc_type == cc_bsls {
613 if re.validator_matches(re.cc[cc_i].validator, ch) {
614 return true
615 }
616 } else if re.char_in_range(ch, re.cc[cc_i].ch0, re.cc[cc_i].ch1) {
617 return true
618 }
619 cc_i++
620 }
621 return false
622}
623
624@[inline]
625fn (re &RE) chars_equal(a rune, b rune) bool {
626 if a == b {
627 return true
628 }
629 if (re.flag & f_ci) == 0 {
630 return false
631 }
632 lower_a := re.case_transform_char(a, false)
633 lower_b := re.case_transform_char(b, false)
634 if lower_a == lower_b {
635 return true
636 }
637 return re.case_transform_char(a, true) == re.case_transform_char(b, true)
638}
639
640@[inline]
641fn (re &RE) char_in_range(ch rune, start rune, end rune) bool {
642 if ch >= start && ch <= end {
643 return true
644 }
645 if (re.flag & f_ci) == 0 {
646 return false
647 }
648 lower := re.case_transform_char(ch, false)
649 if lower != ch && lower >= start && lower <= end {
650 return true
651 }
652 upper := re.case_transform_char(ch, true)
653 return upper != ch && upper >= start && upper <= end
654}
655
656@[inline]
657fn (re &RE) validator_matches(validator FnValidator, ch rune) bool {
658 if validator(u8(ch)) {
659 return true
660 }
661 if (re.flag & f_ci) == 0 {
662 return false
663 }
664 lower := re.case_transform_char(ch, false)
665 if lower != ch && validator(u8(lower)) {
666 return true
667 }
668 upper := re.case_transform_char(ch, true)
669 return upper != ch && validator(u8(upper))
670}
671
672fn (re &RE) case_transform_char(ch rune, upper bool) rune {
673 if upper {
674 if ch >= `a` && ch <= `z` {
675 return ch - 32
676 }
677 return ch
678 }
679 if ch >= `A` && ch <= `Z` {
680 return ch + 32
681 }
682 return ch
683}
684
685@[inline]
686fn (re &RE) token_matches(pc int, ch rune) bool {
687 return match re.prog[pc].ist {
688 ist_simple_char { re.chars_equal(re.prog[pc].ch, ch) }
689 ist_char_class_pos { re.check_char_class(pc, ch) }
690 ist_char_class_neg { !re.check_char_class(pc, ch) }
691 ist_bsls_char { re.validator_matches(re.prog[pc].validator, ch) }
692 else { false }
693 }
694}
695
696// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
697fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, u32) {
698 mut status := CharClass_parse_state.start
699 mut i := in_i
700
701 mut tmp_index := re.cc_index
702 res_index := re.cc_index
703
704 mut cc_type := u32(ist_char_class_pos)
705
706 for i < in_txt.len {
707 // check if we are out of memory for char classes
708 if tmp_index >= re.cc.len {
709 return err_cc_alloc_overflow, 0, u32(0)
710 }
711
712 // get our char
713 char_tmp, char_len := re.get_char(in_txt, i)
714 ch := u8(char_tmp)
715
716 // println("CC #${i:3d} ch: ${ch:c}")
717
718 // negation
719 if status == .start && ch == `^` {
720 cc_type = u32(ist_char_class_neg)
721 i += char_len
722 continue
723 }
724
725 // minus symbol
726 if status == .start && ch == `-` {
727 re.cc[tmp_index].cc_type = cc_char
728 re.cc[tmp_index].ch0 = char_tmp
729 re.cc[tmp_index].ch1 = char_tmp
730 i += char_len
731 tmp_index++
732 continue
733 }
734
735 // bsls
736 if (status == .start || status == .in_char) && ch == `\\` {
737 // println("CC bsls.")
738 status = .in_bsls
739 i += char_len
740 continue
741 }
742
743 if status == .in_bsls {
744 // println("CC bsls validation.")
745 for c, x in bsls_validator_array {
746 if x.ch == ch {
747 // println("CC bsls found [${ch:c}]")
748 re.cc[tmp_index].cc_type = cc_bsls
749 re.cc[tmp_index].ch0 = bsls_validator_array[c].ch
750 re.cc[tmp_index].ch1 = bsls_validator_array[c].ch
751 re.cc[tmp_index].validator = bsls_validator_array[c].validator
752 i += char_len
753 tmp_index++
754 status = .in_char
755 break
756 }
757 }
758 if status == .in_bsls {
759 // manage as a simple char
760 // println("CC bsls not found [${ch:c}]")
761 re.cc[tmp_index].cc_type = cc_char
762 re.cc[tmp_index].ch0 = char_tmp
763 re.cc[tmp_index].ch1 = char_tmp
764 i += char_len
765 tmp_index++
766 status = .in_char
767 continue
768 } else {
769 continue
770 }
771 }
772
773 // simple char
774 if (status == .start || status == .in_char) && ch != `-` && ch != `]` {
775 status = .in_char
776
777 re.cc[tmp_index].cc_type = cc_char
778 re.cc[tmp_index].ch0 = char_tmp
779 re.cc[tmp_index].ch1 = char_tmp
780
781 i += char_len
782 tmp_index++
783 continue
784 }
785
786 // check range separator
787 if status == .in_char && ch == `-` {
788 status = .separator
789 i += char_len
790 continue
791 }
792
793 // check range end
794 if status == .separator && ch != `]` && ch != `-` {
795 status = .in_char
796 re.cc[tmp_index - 1].cc_type = cc_int
797 re.cc[tmp_index - 1].ch1 = char_tmp
798 i += char_len
799 continue
800 }
801
802 // char class end
803 if status == .in_char && ch == `]` {
804 re.cc[tmp_index].cc_type = cc_end
805 re.cc[tmp_index].ch0 = 0
806 re.cc[tmp_index].ch1 = 0
807 re.cc_index = tmp_index + 1
808
809 return res_index, i - in_i + 2, cc_type
810 }
811
812 i++
813 }
814 return err_syntax_error, 0, u32(0)
815}
816
817/******************************************************************************
818*
819* Re Compiler
820*
821******************************************************************************/
822//
823// Quantifier
824//
825enum Quant_parse_state {
826 start
827 min_parse
828 comma_checked
829 max_parse
830 greedy
831 gredy_parse
832 finish
833}
834
835// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
836fn (re &RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
837 mut status := Quant_parse_state.start
838 mut i := in_i
839
840 mut q_min := 0 // default min in a {} quantifier is 1
841 mut q_max := 0 // default max in a {} quantifier is max_quantifier
842
843 mut ch := u8(0)
844
845 for i < in_txt.len {
846 unsafe {
847 ch = in_txt.str[i]
848 }
849 // println("${ch:c} status: ${status}")
850
851 // exit on no compatible char with {} quantifier
852 if utf8util_char_len(ch) != 1 {
853 return err_syntax_error, i, 0, false
854 }
855
856 // min parsing skip if comma present
857 if status == .start && ch == `,` {
858 q_min = 0 // default min in a {} quantifier is 0
859 status = .comma_checked
860 i++
861 continue
862 }
863
864 if status == .start && is_digit(ch) {
865 status = .min_parse
866 q_min *= 10
867 q_min += int(ch - `0`)
868 i++
869 continue
870 }
871
872 if status == .min_parse && is_digit(ch) {
873 q_min *= 10
874 q_min += int(ch - `0`)
875 i++
876 continue
877 }
878
879 // we have parsed the min, now check the max
880 if status == .min_parse && ch == `,` {
881 status = .comma_checked
882 i++
883 continue
884 }
885
886 // single value {4}
887 if status == .min_parse && ch == `}` {
888 q_max = q_min
889 status = .greedy
890 continue
891 }
892
893 // end without max
894 if status == .comma_checked && ch == `}` {
895 q_max = max_quantifier
896 status = .greedy
897 continue
898 }
899
900 // start max parsing
901 if status == .comma_checked && is_digit(ch) {
902 status = .max_parse
903 q_max *= 10
904 q_max += int(ch - `0`)
905 i++
906 continue
907 }
908
909 // parse the max
910 if status == .max_parse && is_digit(ch) {
911 q_max *= 10
912 q_max += int(ch - `0`)
913 i++
914 continue
915 }
916
917 // finished the quantifier
918 if status == .max_parse && ch == `}` {
919 status = .greedy
920 continue
921 }
922
923 // check if greedy flag char ? is present
924 if status == .greedy {
925 if i + 1 < in_txt.len {
926 i++
927 status = .gredy_parse
928 continue
929 }
930 return q_min, q_max, i - in_i + 2, false
931 }
932
933 // check the greedy flag
934 if status == .gredy_parse {
935 if ch == `?` {
936 return q_min, q_max, i - in_i + 2, true
937 } else {
938 i--
939 return q_min, q_max, i - in_i + 2, false
940 }
941 }
942
943 // not a {} quantifier, exit
944 return err_syntax_error, i, 0, false
945 }
946
947 // not a conform {} quantifier
948 return err_syntax_error, i, 0, false
949}
950
951//
952// Groups
953//
954enum Group_parse_state {
955 start
956 q_mark // (?
957 q_mark1 // (?:|P checking
958 p_status // (?P
959 p_start // (?P<
960 p_end // (?P<...>
961 p_in_name // (?P<...
962 finish
963}
964
965// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index)
966fn (re &RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
967 mut status := Group_parse_state.start
968 mut i := in_i
969 mut name := ''
970
971 for i < in_txt.len && status != .finish {
972 // get our char
973 char_tmp, char_len := re.get_char(in_txt, i)
974 ch := u8(char_tmp)
975
976 // start
977 if status == .start && ch == `(` {
978 status = .q_mark
979 i += char_len
980 continue
981 }
982
983 // check for question marks
984 if status == .q_mark && ch == `?` {
985 status = .q_mark1
986 i += char_len
987 continue
988 }
989
990 // negate group
991 if status == .q_mark1 && ch == `!` {
992 i += char_len
993 return 0, false, true, name, i
994 }
995
996 // non capturing group
997 if status == .q_mark1 && ch == `:` {
998 i += char_len
999 return 0, false, false, name, i
1000 }
1001
1002 // enter in P section
1003 if status == .q_mark1 && ch == `P` {
1004 status = .p_status
1005 i += char_len
1006 continue
1007 }
1008
1009 // not a valid q mark found
1010 if status == .q_mark1 {
1011 // println("NO VALID Q MARK")
1012 return -2, true, false, name, i
1013 }
1014
1015 if status == .p_status && ch == `<` {
1016 status = .p_start
1017 i += char_len
1018 continue
1019 }
1020
1021 if status == .p_start && ch != `>` {
1022 status = .p_in_name
1023 name += '${ch:1c}' // TODO: manage utf8 chars
1024 i += char_len
1025 continue
1026 }
1027
1028 // colect name
1029 if status == .p_in_name && ch != `>` && is_alnum(ch) {
1030 name += '${ch:1c}' // TODO: manage utf8 chars
1031 i += char_len
1032 continue
1033 }
1034
1035 // end name
1036 if status == .p_in_name && ch == `>` {
1037 i += char_len
1038 return 0, true, false, name, i
1039 }
1040
1041 // error on name group
1042 if status == .p_in_name {
1043 return -2, true, false, name, i
1044 }
1045
1046 // normal group, nothig to do, exit
1047 return 0, true, false, name, i
1048 }
1049 // UNREACHABLE
1050 // println("ERROR!! NOT MEANT TO BE HERE!!1")
1051 return -2, true, false, name, i
1052}
1053
1054const quntifier_chars = [rune(`+`), `*`, `?`, `{`]
1055
1056//
1057// main compiler
1058//
1059// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
1060fn (mut re RE) impl_compile(in_txt string) (int, int) {
1061 mut i := 0 // input string index
1062 mut pc := 0 // program counter
1063
1064 // group management variables
1065 mut group_count := -1
1066 mut group_stack := []int{len: re.group_max_nested, init: 0}
1067 mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
1068 mut group_stack_index := -1
1069
1070 re.query = in_txt // save the query string
1071
1072 i = 0
1073 for i < in_txt.len {
1074 mut char_tmp := u32(0)
1075 mut char_len := 0
1076 // println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
1077
1078 char_tmp, char_len = re.get_char(in_txt, i)
1079
1080 //
1081 // check special cases: $ ^
1082 //
1083 if char_len == 1 && i == 0 && u8(char_tmp) == `^` {
1084 re.flag |= f_ms
1085 i = i + char_len
1086 continue
1087 }
1088 if char_len == 1 && i == (in_txt.len - 1) && u8(char_tmp) == `$` {
1089 re.flag |= f_me
1090 i = i + char_len
1091 continue
1092 }
1093
1094 // ist_group_start
1095 if char_len == 1 && pc >= 0 && u8(char_tmp) == `(` {
1096 // check max groups allowed
1097 if group_count > re.group_max {
1098 return err_groups_overflow, i + 1
1099 }
1100 group_stack_index++
1101
1102 // check max nested groups allowed
1103 if group_stack_index > re.group_max_nested {
1104 return err_groups_max_nested, i + 1
1105 }
1106
1107 tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt, i)
1108
1109 // manage question mark format error
1110 if tmp_res < -1 {
1111 return err_group_qm_notation, next_i
1112 }
1113
1114 // println("Parse group: [${tmp_res}, ${cgroup_flag}, (${i},${next_i}), '${in_txt[i..next_i]}' ]")
1115 i = next_i
1116
1117 if cgroup_flag == true {
1118 group_count++
1119 }
1120
1121 // calculate the group id
1122 // if it is a named group, recycle the group id
1123 // NOTE: **** the group index is +1 because map return 0 when not found!! ****
1124 mut group_id := group_count
1125 if cgroup_name.len > 0 {
1126 // println("GROUP NAME: ${cgroup_name}")
1127 if cgroup_name in re.group_map {
1128 group_id = re.group_map[cgroup_name] - 1
1129 group_count--
1130 } else {
1131 re.group_map[cgroup_name] = group_id + 1
1132 }
1133 }
1134
1135 group_stack_txt_index[group_stack_index] = i
1136 group_stack[group_stack_index] = pc
1137
1138 re.prog[pc].ist = u32(0) | ist_group_start
1139 re.prog[pc].rep_min = 1
1140 re.prog[pc].rep_max = 1
1141
1142 // manage negation groups
1143 if negate_flag == true {
1144 re.prog[pc].group_neg = true
1145 re.prog[pc].rep_min = 0 // may not be caught, but it is ok
1146 }
1147
1148 // set the group id
1149 if cgroup_flag == false {
1150 // println("NO CAPTURE GROUP")
1151 re.prog[pc].group_id = -1
1152 } else {
1153 re.prog[pc].group_id = group_id
1154 }
1155
1156 pc = pc + 1
1157 continue
1158 }
1159
1160 // ist_group_end
1161 if char_len == 1 && pc > 0 && u8(char_tmp) == `)` {
1162 if group_stack_index < 0 {
1163 return err_group_not_balanced, i + 1
1164 }
1165
1166 goto_pc := group_stack[group_stack_index]
1167 group_stack_index--
1168
1169 re.prog[pc].ist = u32(0) | ist_group_end
1170 re.prog[pc].rep_min = 1
1171 re.prog[pc].rep_max = 1
1172
1173 re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need
1174 re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data
1175
1176 re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
1177 // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data
1178
1179 // duplicate the negation group info and settings
1180 if re.prog[goto_pc].group_neg == true {
1181 re.prog[pc].group_neg = re.prog[goto_pc].group_neg
1182 re.prog[pc].rep_min = re.prog[goto_pc].rep_min
1183 }
1184
1185 pc = pc + 1
1186 i = i + char_len
1187 continue
1188 }
1189
1190 // ist_dot_char match any char except the following token
1191 if char_len == 1 && pc >= 0 && u8(char_tmp) == `.` {
1192 // consecutive ist_dot_char is a syntax error
1193 if pc > 0 && re.prog[pc - 1].ist == ist_dot_char {
1194 return err_consecutive_dots, i
1195 }
1196
1197 re.prog[pc].ist = u32(0) | ist_dot_char
1198 re.prog[pc].rep_min = 1
1199 re.prog[pc].rep_max = 1
1200 pc = pc + 1
1201 i = i + char_len
1202 continue
1203 }
1204
1205 // OR branch
1206 if char_len == 1 && pc > 0 && u8(char_tmp) == `|` {
1207 if pc > 0 && re.prog[pc - 1].ist == ist_or_branch {
1208 return err_syntax_error, i
1209 }
1210 re.prog[pc].ist = u32(0) | ist_or_branch
1211 re.prog[pc].source_index = i
1212 pc = pc + 1
1213 i = i + char_len
1214 continue
1215 }
1216
1217 // Quantifiers
1218 if char_len == 1 && pc > 0 {
1219 mut char_next := rune(0)
1220 mut char_next_len := 0
1221 if (char_len + i) < in_txt.len {
1222 char_next, char_next_len = re.get_char(in_txt, i + char_len)
1223 }
1224 mut quant_flag := true
1225
1226 // negation groups can not have quantifiers
1227 if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
1228 return err_neg_group_quantifier, i
1229 }
1230
1231 match u8(char_tmp) {
1232 `?` {
1233 // println("q: ${char_tmp:c}")
1234 // check illegal quantifier sequences
1235 if char_next_len == 1 && char_next in quntifier_chars {
1236 return err_syntax_error, i
1237 }
1238 re.prog[pc - 1].rep_min = 0
1239 re.prog[pc - 1].rep_max = 1
1240 }
1241 `+` {
1242 // println("q: ${char_tmp:c}")
1243 // check illegal quantifier sequences
1244 if char_next_len == 1 && char_next in quntifier_chars {
1245 return err_syntax_error, i
1246 }
1247 re.prog[pc - 1].rep_min = 1
1248 re.prog[pc - 1].rep_max = max_quantifier
1249 }
1250 `*` {
1251 // println("q: ${char_tmp:c}")
1252 // check illegal quantifier sequences
1253 if char_next_len == 1 && char_next in quntifier_chars {
1254 return err_syntax_error, i
1255 }
1256 re.prog[pc - 1].rep_min = 0
1257 re.prog[pc - 1].rep_max = max_quantifier
1258 }
1259 `{` {
1260 min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1)
1261 // it is a quantifier
1262 if min >= 0 {
1263 // println("{${min},${max}}\n str:[${in_txt[i..i+tmp]}] greedy:${greedy}")
1264 i = i + tmp
1265 re.prog[pc - 1].rep_min = min
1266 re.prog[pc - 1].rep_max = max
1267 re.prog[pc - 1].greedy = greedy
1268 // check illegal quantifier sequences
1269 if i <= in_txt.len {
1270 char_next, char_next_len = re.get_char(in_txt, i)
1271 if char_next_len == 1 && char_next in quntifier_chars {
1272 return err_syntax_error, i
1273 }
1274 }
1275 continue
1276 } else {
1277 return min, i
1278 }
1279
1280 // TODO: decide if the open bracket can be conform without the close bracket
1281 /*
1282 // no conform, parse as normal char
1283 else {
1284 quant_flag = false
1285 }
1286 */
1287 }
1288 else {
1289 quant_flag = false
1290 }
1291 }
1292
1293 if quant_flag {
1294 i = i + char_len
1295 continue
1296 }
1297 }
1298
1299 // IST_CHAR_CLASS_*
1300 if char_len == 1 && pc >= 0 {
1301 if u8(char_tmp) == `[` {
1302 cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1)
1303 if cc_index >= 0 {
1304 // println("index: ${cc_index} str:${in_txt[i..i+tmp]}")
1305 i = i + tmp
1306 re.prog[pc].ist = u32(0) | cc_type
1307 re.prog[pc].cc_index = cc_index
1308 re.prog[pc].rep_min = 1
1309 re.prog[pc].rep_max = 1
1310 pc = pc + 1
1311 continue
1312 }
1313 // cc_class vector memory full
1314 else if cc_index < 0 {
1315 return cc_index, i
1316 }
1317 }
1318 }
1319
1320 // ist_bsls_char
1321 if char_len == 1 && pc >= 0 {
1322 if u8(char_tmp) == `\\` {
1323 // if the index is negative:
1324 // -1 ERROR
1325 // -2 hex byte code BLSL
1326 bsls_index, tmp, hex_res := re.parse_bsls(in_txt, i)
1327 if bsls_index >= 0 {
1328 i = i + tmp
1329 re.prog[pc].ist = u32(0) | ist_bsls_char
1330 re.prog[pc].rep_min = 1
1331 re.prog[pc].rep_max = 1
1332 re.prog[pc].validator = bsls_validator_array[bsls_index].validator
1333 re.prog[pc].ch = bsls_validator_array[bsls_index].ch
1334 pc = pc + 1
1335 continue
1336 }
1337 // hex char
1338 // this code can mange up to \x for 32 bit
1339 // at the present time only 8/16 bit are used
1340 else if bsls_index == -2 {
1341 mut value := hex_res
1342 mut value_list := []u32{cap: 4}
1343 mut count := 0
1344 for value > 0 {
1345 value_list << value & 0xFF
1346 value = value >> 8
1347 count++
1348 }
1349
1350 count--
1351 for count >= 0 {
1352 re.prog[pc].ist = ist_simple_char
1353 re.prog[pc].ch = value_list[count]
1354 re.prog[pc].ch_len = u8(char_len)
1355 re.prog[pc].rep_min = 1
1356 re.prog[pc].rep_max = 1
1357 re.prog[pc].flag = 1 // state a byte char
1358 // println("char: ${char_tmp:c}")
1359 pc = pc + 1
1360 count--
1361 }
1362 i = i + tmp
1363 continue
1364 }
1365 // this is an escape char, skip the bsls and continue as a normal char
1366 else if bsls_index == no_match_found {
1367 i += char_len
1368 char_tmp, char_len = re.get_char(in_txt, i)
1369 // continue as simple char
1370 }
1371 // if not an escape or a bsls char then it is an error (at least for now!)
1372 else {
1373 return bsls_index, i + tmp
1374 }
1375 }
1376 }
1377
1378 // ist_simple_char
1379 re.prog[pc].ist = ist_simple_char
1380 re.prog[pc].ch = char_tmp
1381 re.prog[pc].ch_len = u8(char_len)
1382 re.prog[pc].rep_min = 1
1383 re.prog[pc].rep_max = 1
1384 // println("char: ${char_tmp:c}")
1385 pc = pc + 1
1386
1387 i += char_len
1388 }
1389
1390 // add end of the program
1391 re.prog[pc].ist = ist_prog_end
1392 re.prog_len = pc
1393
1394 // check for unbalanced groups
1395 if group_stack_index != -1 {
1396 return err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1
1397 }
1398
1399 // check for OR at the end of the program
1400 if pc > 0 && re.prog[pc - 1].ist == ist_or_branch {
1401 return err_syntax_error, in_txt.len - 1
1402 }
1403
1404 // store the number of groups in the query
1405 re.group_count = group_count + 1
1406
1407 //******************************************
1408 // Post processing
1409 //******************************************
1410
1411 //
1412 // manage ist_dot_char
1413 //
1414
1415 // find the checks for dot chars, if any...
1416 mut pc1 := 0
1417 mut dot_char_count := 0
1418 mut last_dot_char_pc := -1
1419 for pc1 < pc {
1420 if re.prog[pc1].ist == ist_dot_char {
1421 // println("Dot_char pc: ${pc1}")
1422 last_dot_char_pc = pc1
1423 dot_char_count++
1424 mut pc2 := pc1 + 1
1425 for pc2 < pc {
1426 // consecutive dot chars is an error
1427 if re.prog[pc2].ist == ist_dot_char {
1428 return err_syntax_error, 0
1429 }
1430 if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] {
1431 // println("Next dot char check is PC: ${pc2}")
1432 re.prog[pc1].dot_check_pc = pc2
1433 break
1434 }
1435 pc2++
1436 }
1437 }
1438 pc1++
1439 }
1440
1441 // println("last_dot_char_pc: ${last_dot_char_pc}")
1442 if last_dot_char_pc >= 0 {
1443 pc1 = last_dot_char_pc + 1
1444 mut is_last_dot := true
1445 for pc1 < pc {
1446 if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] {
1447 is_last_dot = false
1448 break
1449 }
1450 pc1++
1451 }
1452 if is_last_dot {
1453 re.prog[last_dot_char_pc].last_dot_flag = true
1454 }
1455 }
1456
1457 //
1458 // manage bsls_char
1459 //
1460
1461 // find the checks for bsls, if any...
1462 pc1 = 0
1463 mut bsls_char_count := 0
1464 mut last_bsls_char_pc := -1
1465 for pc1 < pc {
1466 if re.prog[pc1].ist == ist_bsls_char {
1467 // println("bsls_char pc: ${pc1}")
1468 last_bsls_char_pc = pc1
1469 bsls_char_count++
1470 mut pc2 := pc1 + 1
1471 for pc2 < pc {
1472 if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] {
1473 // println("Next bsls check is PC: ${pc2}")
1474 re.prog[pc1].bsls_check_pc = pc2
1475 break
1476 }
1477 pc2++
1478 }
1479 }
1480 pc1++
1481 }
1482
1483 // println('last_bsls_char_pc: ${last_bsls_char_pc}')
1484 if last_bsls_char_pc >= 0 {
1485 pc1 = last_bsls_char_pc + 1
1486 mut is_last_bsls := true
1487 for pc1 < pc {
1488 if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] {
1489 is_last_bsls = false
1490 break
1491 }
1492 pc1++
1493 }
1494 if is_last_bsls {
1495 re.prog[last_bsls_char_pc].last_dot_flag = true
1496 }
1497 }
1498
1499 //
1500 // manage CC
1501 //
1502 pc1 = 0
1503 mut cc_char_count := 0
1504 mut last_cc_char_pc := -1
1505 for pc1 < pc {
1506 if re.prog[pc1].ist in [u32(ist_char_class_pos), ist_char_class_neg] {
1507 last_cc_char_pc = pc1
1508 cc_char_count++
1509 mut pc2 := pc1 + 1
1510 for pc2 < pc {
1511 if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] {
1512 // println("Next CC check is PC: ${pc2}")
1513 re.prog[pc1].cc_check_pc = pc2
1514 break
1515 }
1516 pc2++
1517 }
1518 }
1519 pc1++
1520 }
1521
1522 // println('last_cc_char_pc: ${last_cc_char_pc}')
1523 if last_cc_char_pc >= 0 {
1524 pc1 = last_cc_char_pc + 1
1525 mut is_last_cc := true
1526 for pc1 < pc {
1527 if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] {
1528 is_last_cc = false
1529 break
1530 }
1531 pc1++
1532 }
1533 if is_last_cc {
1534 re.prog[last_cc_char_pc].last_dot_flag = true
1535 }
1536 }
1537
1538 //******************************************
1539
1540 // OR branch
1541 // a|b|cd
1542 // d exit point
1543 // a,b,c branches
1544 // set the jump in the right places
1545 pc1 = 0
1546 for pc1 < pc - 2 {
1547 // println("Here ${pc1} ${pc-2}")
1548 // println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}")
1549 if re.prog[pc1 + 1].ist == ist_or_branch {
1550 // two consecutive OR are a syntax error
1551 if re.prog[pc1 + 2].ist == ist_or_branch {
1552 return err_syntax_error, i
1553 }
1554
1555 // check for []|[] errors
1556 if re.prog[pc1].ist == ist_char_class_pos && re.prog[pc1 + 2].ist == ist_char_class_pos {
1557 return err_invalid_or_with_cc, re.prog[pc1 + 1].source_index
1558 }
1559 }
1560
1561 // manange a|b chains like a|(b)|c|d...
1562 // standard solution
1563 if re.prog[pc1].ist != ist_or_branch && re.prog[pc1 + 1].ist == ist_or_branch
1564 && re.prog[pc1 + 2].ist != ist_or_branch {
1565 re.prog[pc1].next_is_or = true // set that the next token is an OR
1566 re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump
1567
1568 // match jump, if an OR chain the next token will be an OR token
1569 mut pc2 := pc1 + 2
1570 for pc2 < pc - 1 {
1571 ist := re.prog[pc2].ist
1572 if ist == ist_group_start {
1573 re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1
1574 break
1575 }
1576 if ist != ist_or_branch {
1577 re.prog[pc1 + 1].rep_max = pc2 + 1
1578 break
1579 }
1580
1581 pc2++
1582 }
1583 // special case query of few chars, the true can't go on the first instruction
1584 if re.prog[pc1 + 1].rep_max == pc1 {
1585 re.prog[pc1 + 1].rep_max = 3
1586 }
1587 // println("Compile OR postproc. [${pc1},OR ${pc1+1},${pc2}]")
1588 pc1 = pc2
1589 continue
1590 }
1591
1592 pc1++
1593 }
1594
1595 //******************************************
1596 // DEBUG PRINT REGEX GENERATED CODE
1597 //******************************************
1598 if re.debug > 0 {
1599 gc := re.get_code()
1600 re.log_func(gc)
1601 }
1602 //******************************************
1603
1604 return compile_ok, 0
1605}
1606
1607// get_code return the compiled code as regex string, note: may be different from the source!
1608pub fn (re &RE) get_code() string {
1609 mut pc1 := 0
1610 mut res := strings.new_builder(re.cc.len * 2 * re.prog.len)
1611 res.write_string('========================================\nv RegEx compiler v ${v_regex_version} output:\n')
1612
1613 mut stop_flag := false
1614
1615 for pc1 <= re.prog.len {
1616 tk := re.prog[pc1]
1617 res.write_string('PC:${pc1:3d}')
1618
1619 res.write_string(' ist: ')
1620 res.write_string('${tk.ist:8x}'.replace(' ', '0'))
1621 res.write_string(' ')
1622 ist := tk.ist
1623 if ist == ist_bsls_char {
1624 res.write_string('[\\${tk.ch:1c}] BSLS')
1625 if tk.last_dot_flag == true {
1626 res.write_string(' last!')
1627 }
1628 } else if ist == ist_prog_end {
1629 res.write_string('PROG_END')
1630 stop_flag = true
1631 } else if ist == ist_or_branch {
1632 res.write_string('OR ')
1633 } else if ist == ist_char_class_pos {
1634 res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS')
1635 if tk.last_dot_flag == true {
1636 res.write_string(' last!')
1637 }
1638 } else if ist == ist_char_class_neg {
1639 res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG')
1640 if tk.last_dot_flag == true {
1641 res.write_string(' last!')
1642 }
1643 } else if ist == ist_dot_char {
1644 res.write_string('. DOT_CHAR nx chk: ${tk.dot_check_pc}')
1645 if tk.last_dot_flag == true {
1646 res.write_string(' last!')
1647 }
1648 } else if ist == ist_group_start {
1649 res.write_string('( GROUP_START #:${tk.group_id}')
1650 if tk.group_id == -1 {
1651 res.write_string(' ?:')
1652 } else {
1653 for x in re.group_map.keys() {
1654 if re.group_map[x] == (tk.group_id + 1) {
1655 res.write_string(' ?P<${x}>')
1656 break
1657 }
1658 }
1659 }
1660 } else if ist == ist_group_end {
1661 res.write_string(') GROUP_END #:${tk.group_id}')
1662 } else if ist == ist_simple_char {
1663 if tk.flag == 0 {
1664 res.write_string('[${tk.ch:1c}] query_ch')
1665 } else {
1666 res.write_string('[0x${tk.ch:02X}]HEXquery_ch')
1667 }
1668 }
1669
1670 if tk.rep_max == max_quantifier {
1671 res.write_string(' {${tk.rep_min:3d},MAX}')
1672 } else {
1673 if ist == ist_or_branch {
1674 res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}')
1675 } else {
1676 res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}')
1677 }
1678 if tk.greedy == true {
1679 res.write_string('?')
1680 }
1681 }
1682
1683 res.write_string('\n')
1684 if stop_flag {
1685 break
1686 }
1687 pc1++
1688 }
1689
1690 res.write_string('========================================\n')
1691 return res.str()
1692}
1693
1694// get_query return a string with a reconstruction of the query starting from the regex program code
1695pub fn (re &RE) get_query() string {
1696 mut res := strings.new_builder(re.query.len * 2)
1697
1698 if (re.flag & f_ms) != 0 {
1699 res.write_string('^')
1700 }
1701
1702 mut i := 0
1703 for i < re.prog.len && re.prog[i].ist != ist_prog_end && re.prog[i].ist != 0 {
1704 tk := unsafe { &re.prog[i] }
1705 ch := tk.ist
1706
1707 // GROUP start
1708 if ch == ist_group_start {
1709 if re.debug > 0 {
1710 res.write_string('#${tk.group_id}')
1711 }
1712 res.write_string('(')
1713
1714 if tk.group_neg == true {
1715 res.write_string('?!') // negation group
1716 } else if tk.group_id == -1 {
1717 res.write_string('?:') // non capturing group
1718 }
1719
1720 for x in re.group_map.keys() {
1721 if re.group_map[x] == (tk.group_id + 1) {
1722 res.write_string('?P<${x}>')
1723 break
1724 }
1725 }
1726
1727 i++
1728 continue
1729 }
1730
1731 // GROUP end
1732 if ch == ist_group_end {
1733 res.write_string(')')
1734 }
1735
1736 // OR branch
1737 if ch == ist_or_branch {
1738 res.write_string('|')
1739 if re.debug > 0 {
1740 res.write_string('{${tk.rep_min},${tk.rep_max}}')
1741 }
1742 i++
1743 continue
1744 }
1745
1746 // char class
1747 if ch == ist_char_class_neg || ch == ist_char_class_pos {
1748 res.write_string('[')
1749 if ch == ist_char_class_neg {
1750 res.write_string('^')
1751 }
1752 res.write_string('${re.get_char_class(i)}')
1753 res.write_string(']')
1754 }
1755
1756 // bsls char
1757 if ch == ist_bsls_char {
1758 res.write_string('\\${tk.ch:1c}')
1759 }
1760
1761 // ist_dot_char
1762 if ch == ist_dot_char {
1763 res.write_string('.')
1764 }
1765
1766 // char alone
1767 if ch == ist_simple_char {
1768 if tk.flag == 0 {
1769 if u8(ch) in bsls_escape_list {
1770 res.write_string('\\')
1771 }
1772 res.write_string('${tk.ch:c}')
1773 } else {
1774 res.write_string('\\x${tk.ch:02x}')
1775 }
1776 }
1777
1778 // quantifier
1779 if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
1780 if tk.rep_min == 0 && tk.rep_max == 1 {
1781 res.write_string('?')
1782 } else if tk.rep_min == 1 && tk.rep_max == max_quantifier {
1783 res.write_string('+')
1784 } else if tk.rep_min == 0 && tk.rep_max == max_quantifier {
1785 res.write_string('*')
1786 } else {
1787 if tk.rep_max == max_quantifier {
1788 res.write_string('{${tk.rep_min},MAX}')
1789 } else {
1790 res.write_string('{${tk.rep_min},${tk.rep_max}}')
1791 }
1792 if tk.greedy == true {
1793 res.write_string('?')
1794 }
1795 }
1796 }
1797 i++
1798 }
1799 if (re.flag & f_me) != 0 {
1800 res.write_string('$')
1801 }
1802
1803 return res.str()
1804}
1805
1806/******************************************************************************
1807*
1808* Groups saving utilities
1809*
1810******************************************************************************/
1811@[direct_array_access]
1812fn (mut re RE) group_continuous_save(g_index int) {
1813 if re.group_csave_flag == true {
1814 // continuous save, save until we have space
1815
1816 // init the first element as counter
1817 if re.group_csave.len == 0 {
1818 re.group_csave << 0
1819 }
1820
1821 gi := g_index >> 1
1822 start := re.groups[g_index]
1823 end := re.groups[g_index + 1]
1824
1825 // check if we are simply increasing the size ot the found group
1826 if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3]
1827 && start == re.group_csave[re.group_csave.len - 2] {
1828 re.group_csave[re.group_csave.len - 1] = end
1829 return
1830 }
1831
1832 // otherwise append a new group to the list
1833
1834 // increment counter
1835 re.group_csave[0]++
1836 // save the record
1837 re.group_csave << (g_index >> 1) // group id
1838 re.group_csave << re.groups[g_index] // start
1839 re.group_csave << re.groups[g_index + 1] // end
1840 }
1841}
1842
1843/******************************************************************************
1844*
1845* Matching
1846*
1847******************************************************************************/
1848enum Match_state {
1849 start = 0
1850 stop
1851 end
1852 new_line
1853 ist_load // load and execute instruction
1854 ist_next // go to next instruction
1855 ist_next_ks // go to next instruction without clenaning the state
1856 ist_quant_p // match positive ,quantifier check
1857 ist_quant_n // match negative, quantifier check
1858 ist_quant_pg // match positive ,group quantifier check
1859 ist_quant_ng // match negative ,group quantifier check
1860}
1861
1862fn state_str(s Match_state) string {
1863 match s {
1864 .start { return 'start' }
1865 .stop { return 'stop' }
1866 .end { return 'end' }
1867 .new_line { return 'new line' }
1868 .ist_load { return 'ist_load' }
1869 .ist_next { return 'ist_next' }
1870 .ist_next_ks { return 'ist_next_ks' }
1871 .ist_quant_p { return 'ist_quant_p' }
1872 .ist_quant_n { return 'ist_quant_n' }
1873 .ist_quant_pg { return 'ist_quant_pg' }
1874 .ist_quant_ng { return 'ist_quant_ng' }
1875 }
1876}
1877
1878struct StateObj {
1879pub mut:
1880 group_index int = -1 // group id used to know how many groups are open
1881 match_flag bool // indicate if we are in a match condition
1882 match_index int = -1 // index of the last match
1883 first_match int = -1 // index of the first match
1884 pc int = -1 // program counter
1885 i int = -1 // source string index
1886 char_len int // last char legth
1887 last_dot_pc int = -1 // last dot chat pc
1888}
1889
1890@[direct_array_access]
1891pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
1892 // result status
1893 mut result := no_match_found // function return
1894
1895 mut ch := rune(0) // examinated char
1896 mut char_len := 0 // utf8 examinated char len
1897 mut m_state := Match_state.start // start point for the matcher FSM
1898 mut src_end := false
1899 mut last_fnd_pc := -1
1900
1901 mut state := StateObj{} // actual state
1902 mut ist := u32(0) // actual instruction
1903 mut l_ist := u32(0) // last matched instruction
1904
1905 mut step_count := 0 // stats for debug
1906 mut dbg_line := 0 // count debug line printed
1907
1908 re.reset()
1909
1910 if re.debug > 0 {
1911 // print header
1912 mut h_buf := strings.new_builder(32)
1913 h_buf.write_string('flags: ')
1914 h_buf.write_string('${re.flag:8x}'.replace(' ', '0'))
1915 h_buf.write_string('\n')
1916 sss := h_buf.str()
1917 re.log_func(sss)
1918 }
1919
1920 for m_state != .end {
1921 if state.pc >= 0 && state.pc < re.prog.len {
1922 ist = re.prog[state.pc].ist
1923 } else if state.pc >= re.prog.len {
1924 // println("ERROR!! PC overflow!!")
1925 return err_internal_error, state.i
1926 }
1927
1928 //******************************************
1929 // DEBUG LOG
1930 //******************************************
1931 if re.debug > 0 {
1932 mut buf2 := strings.new_builder(re.cc.len + 128)
1933
1934 // print all the instructions
1935
1936 // end of the input text
1937 if state.i >= in_txt_len {
1938 buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n')
1939 sss := buf2.str()
1940 re.log_func(sss)
1941 } else {
1942 // print only the exe instruction
1943 if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 {
1944 if ist == ist_prog_end {
1945 buf2.write_string('# ${step_count:3d} PROG_END\n')
1946 } else if ist == 0 || m_state in [.start, .ist_next, .stop] {
1947 buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n')
1948 } else {
1949 ch, char_len = re.get_charb(in_txt, state.i)
1950
1951 buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>')
1952 buf2.write_string('${ist:8x}'.replace(' ', '0'))
1953 buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
1954
1955 if ist == ist_simple_char {
1956 if re.prog[state.pc].flag == 0 {
1957 buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
1958 } else {
1959 buf2.write_string('query_ch: [0x${re.prog[state.pc].ch:02X}]')
1960 }
1961 } else {
1962 if ist == ist_bsls_char {
1963 buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
1964 } else if ist == ist_prog_end {
1965 buf2.write_string('PROG_END')
1966 } else if ist == ist_or_branch {
1967 buf2.write_string('OR')
1968 } else if ist == ist_char_class_pos {
1969 buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]')
1970 } else if ist == ist_char_class_neg {
1971 buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]')
1972 } else if ist == ist_dot_char {
1973 buf2.write_string('DOT_CHAR')
1974 } else if ist == ist_group_start {
1975 tmp_gi := re.prog[state.pc].group_id
1976 tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep
1977 buf2.write_string('GROUP_START #:${tmp_gi} rep:${tmp_gr} ')
1978 } else if ist == ist_group_end {
1979 buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:${state.group_index}')
1980 }
1981 }
1982 if re.prog[state.pc].rep_max == max_quantifier {
1983 buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}')
1984 } else {
1985 buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}')
1986 }
1987 if re.prog[state.pc].greedy == true {
1988 buf2.write_string('?')
1989 }
1990 buf2.write_string(' (#${state.group_index})')
1991
1992 if ist == ist_dot_char {
1993 buf2.write_string(' last!')
1994 }
1995
1996 buf2.write_string('\n')
1997 }
1998 sss2 := buf2.str()
1999 re.log_func(sss2)
2000 }
2001 }
2002 step_count++
2003 dbg_line++
2004 }
2005 //******************************************
2006
2007 if ist == ist_prog_end {
2008 // println("HERE we end!")
2009 break
2010 }
2011
2012 // we're out of text, manage it
2013 if state.i >= in_txt_len || m_state == .new_line {
2014 // println("Finished text!!")
2015 src_end = true
2016
2017 // we have fished the text, we must manage out pf bound indexes
2018 if state.i >= in_txt_len {
2019 state.i = in_txt_len - 1
2020 }
2021
2022 // manage groups
2023 if state.group_index >= 0 && state.match_index >= 0 {
2024 // println("End text with open groups!")
2025 // println("state.group_index: ${state.group_index}")
2026 // close the groups
2027 for state.group_index >= 0 {
2028 tmp_pc := re.group_data[state.group_index]
2029 re.prog[tmp_pc].group_rep++
2030 // println("Closing group ${state.group_index} {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
2031
2032 if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min
2033 && re.prog[tmp_pc].group_id >= 0 {
2034 start_i := re.group_stack[state.group_index]
2035 re.group_stack[state.group_index] = -1
2036
2037 // save group results
2038 g_index := re.prog[tmp_pc].group_id * 2
2039 // println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
2040 if start_i >= 0 {
2041 re.groups[g_index] = start_i
2042 } else {
2043 re.groups[g_index] = 0
2044 }
2045
2046 re.groups[g_index + 1] = state.i
2047
2048 if re.groups[g_index + 1] >= in_txt_len {
2049 // println("clamp group on stop!")
2050 re.groups[g_index + 1] = in_txt_len - 1
2051 }
2052
2053 // continuous save, save until we have space
2054 re.group_continuous_save(g_index)
2055 }
2056 state.group_index--
2057 }
2058 }
2059
2060 // println("re.groups: ${re.groups}")
2061
2062 // the text is finished and the groups closed and we are the last group, ok exit
2063 if ist == ist_group_end && re.prog[state.pc + 1].ist == ist_prog_end {
2064 // println("Last group end")
2065 return state.first_match, state.i
2066 }
2067
2068 if state.pc == -1 {
2069 state.pc = last_fnd_pc
2070 }
2071
2072 // println("Finished text!!")
2073 // println("Instruction: ${ist:08x} pc: ${state.pc}")
2074 // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}")
2075
2076 // program end
2077 if ist == ist_prog_end {
2078 // println("Program end on end of text!")
2079 return state.first_match, state.i
2080 }
2081
2082 if l_ist in [
2083 u32(ist_char_class_neg),
2084 ist_char_class_pos,
2085 ist_bsls_char,
2086 ist_dot_char,
2087 ] {
2088 // println("***** We have a last special token")
2089 // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
2090 // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
2091 // println("first match: ${state.first_match}")
2092
2093 if re.prog[state.pc].last_dot_flag == true
2094 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min
2095 && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
2096 return state.first_match, state.i
2097 }
2098 // println("Not fitted!!")
2099 }
2100 // no groups open, check the last token quantifier
2101 if ist != ist_group_end && re.prog[state.pc + 1].ist == ist_prog_end {
2102 if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
2103 && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
2104 // println("We are in good repetition")
2105 return state.first_match, state.i
2106 }
2107 }
2108
2109 // println("No good exit!!")
2110 if re.prog[re.prog_len - 1].ist == ist_group_end {
2111 // println("last ist is a group end!")
2112 if re.prog[re.prog_len - 1].group_rep >= re.prog[re.prog_len - 1].rep_min {
2113 return state.first_match, state.i
2114 }
2115 }
2116 return no_match_found, state.i
2117 }
2118
2119 // starting and init
2120 if m_state == .start {
2121 state.pc = -1
2122 state.i = 0
2123 m_state = .ist_next
2124 continue
2125 }
2126 // ist_next, next instruction reseting its state
2127 else if m_state == .ist_next {
2128 state.pc = state.pc + 1
2129 re.prog[state.pc].reset()
2130 // check if we are in the program bounds
2131 if state.pc < 0 || state.pc > re.prog.len {
2132 // println("ERROR!! PC overflow!!")
2133 return err_internal_error, state.i
2134 }
2135 m_state = .ist_load
2136 continue
2137 }
2138 // ist_next_ks, next instruction keeping its state
2139 else if m_state == .ist_next_ks {
2140 state.pc = state.pc + 1
2141 // check if we are in the program bounds
2142 if state.pc < 0 || state.pc > re.prog.len {
2143 // println("ERROR!! PC overflow!!")
2144 return err_internal_error, state.i
2145 }
2146 m_state = .ist_load
2147 continue
2148 }
2149
2150 // load the char
2151 ch, char_len = re.get_charb(in_txt, state.i)
2152
2153 // check new line if flag f_nl enabled
2154 if (re.flag & f_nl) != 0 && char_len == 1 && u8(ch) in new_line_list {
2155 m_state = .new_line
2156 continue
2157 }
2158 // check if stop
2159 else if m_state == .stop {
2160 // we are in search mode, don't exit until the end
2161 if (re.flag & f_src) != 0 && ist != ist_prog_end {
2162 last_fnd_pc = state.pc
2163 state.pc = -1
2164 state.i += char_len
2165
2166 m_state = .ist_next
2167 re.reset_src()
2168 state.match_index = -1
2169 state.first_match = -1
2170
2171 // reset state list
2172 re.reset()
2173
2174 continue
2175 }
2176
2177 if ist == ist_prog_end {
2178 return state.first_match, state.i
2179 }
2180
2181 // manage here dot char
2182
2183 if re.state_list.len > 0 {
2184 // println("Here we are, with stop: state buffer: [${re.state_list.len}]")
2185 state = re.state_list.pop()
2186
2187 state.match_flag = true
2188 l_ist = u32(ist_dot_char)
2189
2190 if state.first_match < 0 {
2191 state.first_match = state.i
2192 }
2193 state.match_index = state.i
2194 re.prog[state.pc].rep++ // increase repetitions
2195
2196 state.i += char_len
2197 m_state = .ist_quant_p
2198 continue
2199 }
2200
2201 // exit on no match
2202 return result, state.i
2203 }
2204 // ist_load
2205 else if m_state == .ist_load {
2206 // program end
2207 if ist == ist_prog_end {
2208 // if we are in match exit well
2209
2210 if state.group_index >= 0 && state.match_index >= 0 {
2211 state.group_index = -1
2212 }
2213
2214 m_state = .stop
2215 continue
2216 }
2217 // check GROUP start, no quantifier is checkd for this token!!
2218 else if ist == ist_group_start {
2219 state.group_index++
2220 re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
2221 re.group_stack[state.group_index] = state.i // index where we start to manage
2222 // println("group_index ${state.group_index} rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
2223
2224 m_state = .ist_next
2225 continue
2226 }
2227 // check GROUP end
2228 else if ist == ist_group_end {
2229 // we are in matching streak
2230 // println("Group END!! last ist: ${l_ist:08x}")
2231 if state.match_index >= 0 {
2232 // restore txt index stack and save the group data
2233
2234 // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
2235 if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
2236 start_i := re.group_stack[state.group_index]
2237
2238 // save group results
2239 g_index := re.prog[state.pc].group_id * 2
2240
2241 if start_i >= 0 {
2242 re.groups[g_index] = start_i
2243 } else {
2244 re.groups[g_index] = 0
2245 }
2246
2247 re.groups[g_index + 1] = state.i
2248
2249 if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] {
2250 re.groups[g_index] = re.groups[g_index - 1]
2251 }
2252
2253 if re.groups[g_index + 1] >= in_txt_len {
2254 // println("clamp group!")
2255 re.groups[g_index + 1] = in_txt_len - 1
2256 }
2257
2258 // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: ${state.i} in_txt_len: ${in_txt_len}")
2259
2260 // continuous save, save until we have space
2261 re.group_continuous_save(g_index)
2262 }
2263
2264 re.prog[state.pc].group_rep++ // increase repetitions
2265 // println("GROUP ${group_index} END ${re.prog[state.pc].group_rep}")
2266 if re.prog[state.pc].group_rep > in_txt_len - 1 {
2267 m_state = .ist_quant_ng
2268 continue
2269 }
2270
2271 m_state = .ist_quant_pg
2272 continue
2273 }
2274
2275 m_state = .ist_quant_ng
2276 continue
2277 }
2278 // check OR
2279 else if ist == ist_or_branch {
2280 if state.match_index >= 0 {
2281 state.pc = re.prog[state.pc].rep_max
2282 // println("ist_or_branch True pc: ${state.pc}")
2283 } else {
2284 state.pc = re.prog[state.pc].rep_min
2285 // println("ist_or_branch False pc: ${state.pc}")
2286 }
2287 re.prog[state.pc].reset()
2288 m_state = .ist_load
2289 continue
2290 }
2291 // check ist_dot_char
2292 else if ist == ist_dot_char {
2293 // println("ist_dot_char rep: ${re.prog[state.pc].rep}")
2294
2295 // check next token to be false
2296 mut next_check_flag := false
2297
2298 // if we are done with max go on dot char are dedicated case!!
2299 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2300 re.state_list.pop()
2301 m_state = .ist_next
2302 continue
2303 }
2304
2305 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].dot_check_pc >= 0
2306 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2307 // load the char
2308 // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2309 ch_t := ch
2310 chk_pc := re.prog[state.pc].dot_check_pc
2311 next_check_flag = re.token_matches(chk_pc, ch_t)
2312 }
2313
2314 // check if we must continue or pass to the next IST
2315 if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end {
2316 // println("save the state!!")
2317 mut dot_state := StateObj{
2318 group_index: state.group_index
2319 match_flag: state.match_flag
2320 match_index: state.match_index
2321 first_match: state.first_match
2322 pc: state.pc
2323 i: state.i + char_len
2324 char_len: char_len
2325 last_dot_pc: state.pc
2326 }
2327 // if we are mananging a .* stay on the same char on return
2328 if re.prog[state.pc].rep_min == 0 {
2329 dot_state.i -= char_len
2330 }
2331
2332 re.state_list << dot_state
2333
2334 m_state = .ist_quant_n
2335 // println("dot_char stack len: ${re.state_list.len}")
2336 continue
2337 }
2338
2339 state.match_flag = true
2340 l_ist = u32(ist_dot_char)
2341
2342 if state.first_match < 0 {
2343 state.first_match = state.i
2344 }
2345 state.match_index = state.i
2346 re.prog[state.pc].rep++ // increase repetitions
2347
2348 state.i += char_len
2349 m_state = .ist_quant_p
2350 continue
2351 }
2352 // char class IST
2353 else if ist == ist_char_class_pos || ist == ist_char_class_neg {
2354 // check next token to be false
2355 mut next_check_flag := false
2356
2357 // if we are done with max go on dot char are dedicated case!!
2358 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2359 re.state_list.pop()
2360 m_state = .ist_next
2361 continue
2362 }
2363
2364 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].cc_check_pc >= 0
2365 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2366 // load the char
2367 // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2368 ch_t := ch
2369 chk_pc := re.prog[state.pc].cc_check_pc
2370 next_check_flag = re.token_matches(chk_pc, ch_t)
2371 }
2372
2373 // check if we must continue or pass to the next IST
2374 if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end {
2375 // println("save the state!!")
2376 mut dot_state := StateObj{
2377 group_index: state.group_index
2378 match_flag: state.match_flag
2379 match_index: state.match_index
2380 first_match: state.first_match
2381 pc: state.pc
2382 i: state.i + char_len
2383 char_len: char_len
2384 last_dot_pc: state.pc
2385 }
2386 // if we are managing a \[something]* stay on the same char on return
2387 if re.prog[state.pc].rep_min == 0 {
2388 dot_state.i -= char_len
2389 }
2390
2391 re.state_list << dot_state
2392
2393 m_state = .ist_quant_n
2394 // println("dot_char stack len: ${re.state_list.len}")
2395 continue
2396 }
2397
2398 // println("HERE WE MUST STAY! ${state.i} >= ${in_txt_len}")
2399
2400 state.match_flag = false
2401 mut cc_neg := false
2402
2403 if ist == ist_char_class_neg {
2404 cc_neg = true
2405 }
2406 mut cc_res := re.check_char_class(state.pc, ch)
2407
2408 // manage out of text on char class parse
2409 if state.i >= (in_txt_len - 1) && cc_neg && re.prog[state.pc].last_dot_flag {
2410 m_state = .ist_quant_n
2411 continue
2412 }
2413
2414 if cc_neg {
2415 cc_res = !cc_res
2416 }
2417
2418 if cc_res {
2419 state.match_flag = true
2420 l_ist = u32(ist_char_class_pos)
2421
2422 if state.first_match < 0 {
2423 state.first_match = state.i
2424 }
2425
2426 state.match_index = state.i
2427
2428 re.prog[state.pc].rep++ // increase repetitions
2429 state.i += char_len // next char
2430 m_state = .ist_quant_p
2431 continue
2432 }
2433 m_state = .ist_quant_n
2434 continue
2435 }
2436 // check bsls
2437 else if ist == ist_bsls_char {
2438 // println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
2439
2440 // check next token to be false
2441 mut next_check_flag := false
2442
2443 // if we are done with max go on dot char are dedicated case!!
2444 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2445 re.state_list.pop()
2446 m_state = .ist_next
2447 continue
2448 }
2449
2450 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].bsls_check_pc >= 0
2451 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2452 // load the char
2453 // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2454 ch_t := ch
2455 chk_pc := re.prog[state.pc].bsls_check_pc
2456 next_check_flag = re.token_matches(chk_pc, ch_t)
2457 }
2458
2459 // check if we must continue or pass to the next IST
2460 if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end {
2461 // println("save the state!!")
2462 mut dot_state := StateObj{
2463 group_index: state.group_index
2464 match_flag: state.match_flag
2465 match_index: state.match_index
2466 first_match: state.first_match
2467 pc: state.pc
2468 i: state.i + char_len
2469 char_len: char_len
2470 last_dot_pc: state.pc
2471 }
2472 // if we are managing a \[something]* stay on the same char on return
2473 if re.prog[state.pc].rep_min == 0 {
2474 dot_state.i -= char_len
2475 }
2476
2477 re.state_list << dot_state
2478
2479 m_state = .ist_quant_n
2480 // println("dot_char stack len: ${re.state_list.len}")
2481 continue
2482 }
2483
2484 tmp_res := re.validator_matches(re.prog[state.pc].validator, ch)
2485 if tmp_res == false {
2486 m_state = .ist_quant_n
2487 continue
2488 }
2489 // println("${ch} => ${tmp_res}")
2490
2491 state.match_flag = true
2492 l_ist = u32(ist_dot_char)
2493
2494 if state.first_match < 0 {
2495 state.first_match = state.i
2496 }
2497 state.match_index = state.i
2498 re.prog[state.pc].rep++ // increase repetitions
2499
2500 state.i += char_len
2501 m_state = .ist_quant_p
2502 continue
2503 }
2504 // simple char IST
2505 else if ist == ist_simple_char {
2506 // println("ist_simple_char")
2507 state.match_flag = false
2508
2509 if re.chars_equal(re.prog[state.pc].ch, ch)
2510 && (state.i < in_txt_len - 1 || re.prog[state.pc].ch != 0) {
2511 state.match_flag = true
2512 l_ist = ist_simple_char
2513
2514 if state.first_match < 0 {
2515 state.first_match = state.i
2516 }
2517 // println("state.match_index: ${state.match_index}")
2518 state.match_index = state.i
2519
2520 re.prog[state.pc].rep++ // increase repetitions
2521 state.i += char_len // next char
2522 m_state = .ist_quant_p
2523 continue
2524 }
2525 m_state = .ist_quant_n
2526 continue
2527 }
2528 // UNREACHABLE
2529 // println("PANIC2!! state: ${m_state}")
2530 return err_internal_error, state.i
2531 }
2532 /***********************************
2533 * Quantifier management
2534 ***********************************/
2535 // ist_quant_ng => quantifier negative test on group
2536 else if m_state == .ist_quant_ng {
2537 // we are finished here
2538 if state.group_index < 0 {
2539 // println("Early stop!")
2540 result = no_match_found
2541 m_state = .stop
2542 continue
2543 }
2544
2545 tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
2546 rep := re.prog[tmp_pc].group_rep // use a temp variable
2547 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2548
2549 // println(".ist_quant_ng group_pc_end: ${tmp_pc} rep: ${rep}")
2550
2551 if rep >= re.prog[tmp_pc].rep_min {
2552 // println("ist_quant_ng GROUP CLOSED OK group_index: ${state.group_index}")
2553
2554 state.i = re.group_stack[state.group_index]
2555 state.pc = tmp_pc
2556 state.group_index--
2557 m_state = .ist_next
2558 continue
2559 } else if re.prog[tmp_pc].next_is_or {
2560 // println("ist_quant_ng OR Negative branch")
2561
2562 state.i = re.group_stack[state.group_index]
2563 state.pc = re.prog[tmp_pc + 1].rep_min - 1
2564 state.group_index--
2565 m_state = .ist_next
2566 continue
2567 } else if rep > 0 && rep < re.prog[tmp_pc].rep_min {
2568 // println("ist_quant_ng UNDER THE MINIMUM g.i: ${state.group_index}")
2569
2570 // check if we are inside a group, if yes exit from the nested groups
2571 if state.group_index > 0 {
2572 state.group_index--
2573 state.pc = tmp_pc
2574 m_state = .ist_quant_ng //.ist_next
2575 continue
2576 }
2577
2578 if state.group_index == 0 {
2579 state.group_index--
2580 state.pc = tmp_pc // TEST
2581 m_state = .ist_next
2582 continue
2583 }
2584
2585 result = no_match_found
2586 m_state = .stop
2587 continue
2588 } else if rep == 0 && rep < re.prog[tmp_pc].rep_min {
2589 // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: ${state.group_index}")
2590
2591 if state.group_index > 0 {
2592 state.group_index--
2593 state.pc = tmp_pc
2594 m_state = .ist_quant_ng //.ist_next
2595 continue
2596 }
2597
2598 result = no_match_found
2599 m_state = .stop
2600 continue
2601 }
2602
2603 // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${rep}")
2604 // UNREACHABLE
2605 return err_internal_error, state.i
2606 }
2607 // ist_quant_pg => quantifier positive test on group
2608 else if m_state == .ist_quant_pg {
2609 // println(".ist_quant_pg")
2610 mut tmp_pc := state.pc
2611 if state.group_index >= 0 {
2612 tmp_pc = re.group_data[state.group_index]
2613 }
2614
2615 if re.prog[tmp_pc].group_neg == true {
2616 // println("***** Negation of the group")
2617 result = no_match_found
2618 m_state = .stop
2619 continue
2620 }
2621
2622 rep := re.prog[tmp_pc].group_rep
2623
2624 if rep < re.prog[tmp_pc].rep_min {
2625 // println("ist_quant_pg UNDER RANGE")
2626 state.pc = re.prog[tmp_pc].goto_pc
2627 m_state = .ist_next
2628 continue
2629 } else if rep == re.prog[tmp_pc].rep_max {
2630 // println("ist_quant_pg MAX RANGE")
2631 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2632 state.group_index--
2633 m_state = .ist_next
2634
2635 continue
2636 } else if rep >= re.prog[tmp_pc].rep_min {
2637 // println("ist_quant_pg IN RANGE group_index:${state.group_index}")
2638
2639 // check greedy flag, if true exit on minimum
2640 if re.prog[tmp_pc].greedy == true {
2641 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2642 state.group_index--
2643 m_state = .ist_next
2644 continue
2645 }
2646
2647 state.pc = re.prog[tmp_pc].goto_pc - 1
2648 state.group_index--
2649 m_state = .ist_next
2650 continue
2651 }
2652
2653 // UNREACHABLE
2654 // println("PANIC3!! state: ${m_state}")
2655 return err_internal_error, state.i
2656 }
2657 // ist_quant_n => quantifier negative test on token
2658 else if m_state == .ist_quant_n {
2659 rep := re.prog[state.pc].rep
2660 // println("Here!! PC ${state.pc} is_next_or: ${re.prog[state.pc].next_is_or}")
2661
2662 // zero quantifier * or ?
2663 if rep == 0 && re.prog[state.pc].rep_min == 0 {
2664 // println("ist_quant_n c_zero RANGE MIN")
2665 m_state = .ist_next // go to next ist
2666 continue
2667 }
2668 // match + or *
2669 else if rep >= re.prog[state.pc].rep_min {
2670 // println("ist_quant_n MATCH RANGE")
2671 m_state = .ist_next
2672 continue
2673 }
2674
2675 // check the OR if present
2676 if re.prog[state.pc].next_is_or {
2677 // println("OR present on failing")
2678 state.match_index = -1
2679 m_state = .ist_next
2680 continue
2681 }
2682
2683 // we are in a group manage no match from here
2684 if state.group_index >= 0 {
2685 // println("ist_quant_n FAILED insied a GROUP group_index:${state.group_index}")
2686 m_state = .ist_quant_ng
2687 continue
2688 }
2689
2690 // no other options
2691 // println("ist_quant_n no_match_found")
2692 result = no_match_found
2693 m_state = .stop
2694
2695 // stop already started matching outside a capturing group
2696 if re.state_list.len > 0 && re.state_list.last().group_index == -1
2697 && re.state_list.last().last_dot_pc > 0 {
2698 if ist == ist_dot_char || ist == ist_bsls_char {
2699 return no_match_found, 0
2700 }
2701 }
2702 continue
2703 }
2704 // ist_quant_p => quantifier positive test on token
2705 else if m_state == .ist_quant_p {
2706 // println("Here .ist_quant_p")
2707 // exit on first match
2708 if (re.flag & f_efm) != 0 {
2709 return state.i, state.i + 1
2710 }
2711
2712 rep := re.prog[state.pc].rep
2713 // println("ist_quant_p rep: ${rep} rep_min: ${re.prog[state.pc].rep_min}")
2714
2715 // under range
2716 if rep > 0 && rep < re.prog[state.pc].rep_min {
2717 // println("ist_quant_p UNDER RANGE")
2718 m_state = .ist_load // continue the loop
2719 continue
2720 }
2721 // range ok, continue loop
2722 else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max {
2723 // println("ist_quant_p IN RANGE")
2724
2725 // check greedy flag, if true exit on minimum
2726 if re.prog[state.pc].greedy == true {
2727 m_state = .ist_next
2728 continue
2729 }
2730 m_state = .ist_load
2731 continue
2732 }
2733 // max reached
2734 else if rep == re.prog[state.pc].rep_max {
2735 // println("ist_quant_p MAX RANGE")
2736 m_state = .ist_next
2737 continue
2738 }
2739 }
2740 // UNREACHABLE
2741 // println("PANIC4!! state: ${m_state}")
2742 return err_internal_error, state.i
2743 }
2744
2745 // println("Check end of text!")
2746 // Check the results
2747 if ist == ist_prog_end && state.first_match < 0 {
2748 return state.i, state.i
2749 }
2750 if state.match_index >= 0 {
2751 if state.group_index < 0 {
2752 if re.prog[state.pc].ist == ist_prog_end {
2753 // println("program ended!!")
2754
2755 if (re.flag & f_src) != 0 {
2756 // println("find return")
2757 return state.first_match, state.i
2758 } else {
2759 // println("Here!!")
2760 return 0, state.i
2761 }
2762 }
2763
2764 // println("No Group here, natural end [${state.first_match},${state.i}] state: ${state_str(m_state)} ist: ${ist} pgr_end: ${re.prog.len}")
2765
2766 if re.prog[state.pc + 1].ist == ist_prog_end || re.prog[state.pc].ist == ist_prog_end {
2767 rep := re.prog[state.pc].rep
2768 // println("rep: ${rep} re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}")
2769 if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max {
2770 return state.first_match, state.i
2771 }
2772 // println("Program not finished! ")
2773 return no_match_found, state.i
2774 }
2775 if src_end {
2776 // println("program end")
2777 return state.first_match, state.i
2778 }
2779 // print("No match found!!")
2780 return no_match_found, state.i
2781 } else {
2782 // println("Group match! OK")
2783 // println("first_match: ${state.first_match}, i: ${state.i}")
2784
2785 // println("Skip last group")
2786 return state.first_match, state.i
2787 // return state.first_match,re.group_stack[state.group_index--]
2788 }
2789 }
2790 // println("no_match_found, natural end")
2791 return no_match_found, state.i
2792}
2793