Gitly


1 /*
2 regex 1.0 alpha
3 
4 Copyright (c) 2019-2024 Dario Deledda. All rights reserved.
5 Use of this source code is governed by an MIT license
6 that can be found in the LICENSE file.
7 
8 This file contains regex module
9 
10 Know limitation:
11 - find is implemented in a trivial way
12 - not full compliant PCRE
13 - not compliant POSIX ERE
14 */
15 module regex
16 
17 import strings
18 
19 pub const v_regex_version = '1.0 alpha' // regex module version
20 pub const max_code_len = 256 // default small base code len for the regex programs
21 pub const max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
22 
23 // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
24 pub const spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
25 // new line chars for now only '\n'
26 pub const new_line_list = [`\n`, `\r`]
27 
28 // Results
29 pub const no_match_found = -1
30 
31 // Errors
32 pub const compile_ok = 0 // the regex string compiled, all ok
33 pub const err_char_unknown = -2 // the char used is unknow to the system
34 pub const err_undefined = -3 // the compiler symbol is undefined
35 pub const err_internal_error = -4 // Bug in the regex system!!
36 pub const err_cc_alloc_overflow = -5 // memory for char class full!!
37 pub const err_syntax_error = -6 // syntax error in regex compiling
38 pub const err_groups_overflow = -7 // max number of groups reached
39 pub const err_groups_max_nested = -8 // max number of nested group reached
40 pub const err_group_not_balanced = -9 // group not balanced
41 pub const err_group_qm_notation = -10 // group invalid notation
42 pub const err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
43 pub const err_neg_group_quantifier = -12 // negation groups can not have quantifier
44 pub const err_consecutive_dots = -13
45 
46 //*************************************
47 // regex program instructions
48 //*************************************
49 const ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char
50 
51 // char class 11 0100 AA xxxxxxxx
52 // AA = 00  regular class
53 // AA = 01  Negated class ^ char
54 const ist_char_class = u32(0xD1000000) // MASK
55 const ist_char_class_pos = u32(0xD0000000) // char class normal [abc]
56 const ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]
57 
58 // dot char        10 0110 xx xxxxxxxx
59 const ist_dot_char = u32(0x98000000) // match any char except \n
60 
61 // backslash chars 10 0100 xx xxxxxxxx
62 const ist_bsls_char = u32(0x90000000) // backslash char
63 
64 // OR |            10 010Y xx xxxxxxxx
65 const ist_or_branch = u32(0x91000000) // OR case
66 
67 // groups          10 010Y xx xxxxxxxx
68 const ist_group_start = u32(0x92000000) // group start (
69 const ist_group_end = u32(0x94000000) // group end   )
70 
71 // control instructions
72 const ist_prog_end = u32(0x88000000)
73 
74 /*
75 General Utilities
76 */
77 // utf8util_char_len calculate the length in bytes of a utf8 char
78 @[inline]
79 fn utf8util_char_len(b u8) int {
80     return int(((u32(0xe5000000) >> ((b >> 3) & 0x1e)) & 3) + 1)
81 }
82 
83 // get_char get a char from position i and return an u32 with the unicode code
84 @[direct_array_access; inline]
85 fn (re &RE) get_char(in_txt string, i int) (u32, int) {
86     ini := unsafe { in_txt.str[i] }
87     // ascii 8 bit
88     if (re.flag & f_bin) != 0 || ini & 0x80 == 0 {
89         return u32(ini), 1
90     }
91     // unicode char
92     char_len := utf8util_char_len(ini)
93     mut tmp := 0
94     mut ch := u32(0)
95     for tmp < char_len {
96         ch = (ch << 8) | unsafe { in_txt.str[i + tmp] }
97         tmp++
98     }
99     return ch, char_len
100 }
101 
102 // get_charb get a char from position i and return an u32 with the unicode code
103 @[direct_array_access; inline]
104 fn (re &RE) get_charb(in_txt &u8, i int) (u32, int) {
105     // ascii 8 bit
106     if (re.flag & f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 {
107         return u32(unsafe { in_txt[i] }), 1
108     }
109     // unicode char
110     char_len := utf8util_char_len(unsafe { in_txt[i] })
111     mut tmp := 0
112     mut ch := u32(0)
113     for tmp < char_len {
114         ch = (ch << 8) | unsafe { in_txt[i + tmp] }
115         tmp++
116     }
117     return ch, char_len
118 }
119 
120 @[inline]
121 fn is_alnum(in_char u8) bool {
122     mut tmp := in_char - `A`
123     if tmp <= 25 {
124         return true
125     }
126     tmp = in_char - `a`
127     if tmp <= 25 {
128         return true
129     }
130     tmp = in_char - `0`
131     if tmp <= 9 {
132         return true
133     }
134     if in_char == `_` {
135         return true
136     }
137     return false
138 }
139 
140 @[inline]
141 fn is_not_alnum(in_char u8) bool {
142     return !is_alnum(in_char)
143 }
144 
145 @[inline]
146 fn is_space(in_char u8) bool {
147     return in_char in spaces
148 }
149 
150 @[inline]
151 fn is_not_space(in_char u8) bool {
152     return !is_space(in_char)
153 }
154 
155 @[inline]
156 fn is_digit(in_char u8) bool {
157     tmp := in_char - `0`
158     return tmp <= 0x09
159 }
160 
161 @[inline]
162 fn is_not_digit(in_char u8) bool {
163     return !is_digit(in_char)
164 }
165 
166 /*
167 @[inline]
168 fn is_wordchar(in_char byte) bool {
169     return is_alnum(in_char) || in_char == `_`
170 }
171 
172 @[inline]
173 fn is_not_wordchar(in_char byte) bool {
174     return !is_alnum(in_char)
175 }
176 */
177 
178 @[inline]
179 fn is_lower(in_char u8) bool {
180     tmp := in_char - `a`
181     return tmp <= 25
182 }
183 
184 @[inline]
185 fn is_upper(in_char u8) bool {
186     tmp := in_char - `A`
187     return tmp <= 25
188 }
189 
190 pub fn (re &RE) get_parse_error_string(err int) string {
191     match err {
192         compile_ok { return 'compile_ok' }
193         no_match_found { return 'no_match_found' }
194         err_char_unknown { return 'err_char_unknown' }
195         err_undefined { return 'err_undefined' }
196         err_internal_error { return 'err_internal_error' }
197         err_cc_alloc_overflow { return 'err_cc_alloc_overflow' }
198         err_syntax_error { return 'err_syntax_error' }
199         err_groups_overflow { return 'err_groups_overflow' }
200         err_groups_max_nested { return 'err_groups_max_nested' }
201         err_group_not_balanced { return 'err_group_not_balanced' }
202         err_group_qm_notation { return 'err_group_qm_notation' }
203         err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
204         err_neg_group_quantifier { return 'err_neg_group_quantifier' }
205         err_consecutive_dots { return 'err_consecutive_dots' }
206         else { return 'err_unknown' }
207     }
208 }
209 
210 // utf8_str convert and utf8 sequence to a printable string
211 @[inline]
212 fn utf8_str(ch rune) string {
213     mut i := 4
214     mut res := ''
215     for i > 0 {
216         v := u8((ch >> ((i - 1) * 8)) & 0xFF)
217         if v != 0 {
218             res += '${v:1c}'
219         }
220         i--
221     }
222     return res
223 }
224 
225 // simple_log default log function
226 fn simple_log(txt string) {
227     print(txt)
228 }
229 
230 /******************************************************************************
231 *
232 * Token Structs
233 *
234 ******************************************************************************/
235 pub type FnValidator = fn (u8) bool
236 
237 struct Token {
238 mut:
239     ist u32
240     // char
241     ch     rune // char of the token if any
242     ch_len u8   // char len
243     flag   u8   // flag for general usage
244     // Quantifiers / branch
245     rep_min int  // used also for jump next in the OR branch [no match] pc jump
246     rep_max int  // used also for jump next in the OR branch [   match] pc jump
247     greedy  bool // greedy quantifier flag
248     // Char class
249     cc_index int = -1
250     // counters for quantifier check (repetitions)
251     rep int
252     // validator function pointer
253     validator FnValidator = unsafe { nil }
254     // groups variables
255     group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion
256     group_rep int  // repetition of the group
257     group_id  int = -1 // id of the group
258     goto_pc   int = -1 // jump to this PC if is needed
259     // OR flag for the token
260     next_is_or bool // true if the next token is an OR
261     // dot_char token variables
262     dot_check_pc  int = -1 // pc of the next token to check for dots
263     bsls_check_pc int = -1 // pc of the next token to check for bsls
264     cc_check_pc   int = -1 // pc of the next token to check for CC
265     last_dot_flag bool // if true indicate that is the last dot_char in the regex
266     // debug fields
267     source_index int
268 }
269 
270 @[inline]
271 fn (mut tok Token) reset() {
272     tok.rep = 0
273 }
274 
275 /******************************************************************************
276 *
277 * Regex struct
278 *
279 ******************************************************************************/
280 pub const f_nl = 0x00000001 // end the match when find a new line symbol
281 pub const f_ms = 0x00000002 // match true only if the match is at the start of the string
282 pub const f_me = 0x00000004 // match true only if the match is at the end of the string
283 pub const f_efm = 0x00000100 // exit on first token matched, used by search
284 pub const f_bin = 0x00000200 // work only on bytes, ignore utf-8
285 pub const f_ci = 0x00000400 // compare ASCII letters without case sensitivity
286 
287 // behaviour modifier flags
288 pub const f_src = 0x00020000
289 
290 // Log function prototype
291 pub type FnLog = fn (string)
292 
293 pub struct RE {
294 pub mut:
295     prog     []Token
296     prog_len int // regex program len
297     // char classes storage
298     cc       []CharClass // char class list
299     cc_index int         // index
300     // groups
301     group_count      int   // number of groups in this regex struct
302     groups           []int // groups index results
303     group_max_nested int = 3 // max nested group
304     group_max        int = 8 // max allowed number of different groups
305 
306     state_list []StateObj
307 
308     group_csave_flag bool  // flag to enable continuous saving
309     group_csave      []int //= []int{}  // groups continuous save list
310 
311     group_map map[string]int // groups names map
312 
313     group_stack []int
314     group_data  []int
315     // flags
316     flag int // flag for optional parameters
317     // Debug/log
318     debug    int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
319     log_func FnLog = simple_log // log function, can be customized by the user
320     query    string // query string
321 }
322 
323 // Reset RE object
324 @[direct_array_access; inline]
325 pub fn (mut re RE) reset() {
326     re.cc_index = 0
327 
328     mut i := 0
329     for i < re.prog_len {
330         re.prog[i].group_rep = 0 // clear repetition of the group
331         re.prog[i].rep = 0 // clear repetition of the token
332         i++
333     }
334 
335     // init groups array
336     if re.group_count > 0 {
337         if re.groups.len == 0 {
338             // first run alloc memory
339             re.groups = []int{len: re.group_count * 2, init: -1}
340         } else {
341             // subsequent executions, only clean up the memory
342             i = 0
343             for i < re.groups.len {
344                 re.groups[i] = -1
345                 i++
346             }
347         }
348     }
349 
350     // reset group_csave
351     if re.group_csave_flag == true {
352         re.group_csave.clear() // = []int{}
353     }
354 
355     // reset state list
356     re.state_list.clear()
357     // restore initial state of the stack
358     for mut x in re.group_stack {
359         x = -1
360     }
361 }
362 
363 // reset for search mode fail
364 // gcc bug, dont use @[inline] or go 5 time slower
365 //[inline]
366 @[direct_array_access]
367 fn (mut re RE) reset_src() {
368     mut i := 0
369     for i < re.prog_len {
370         re.prog[i].group_rep = 0 // clear repetition of the group
371         re.prog[i].rep = 0 // clear repetition of the token
372         i++
373     }
374 }
375 
376 /******************************************************************************
377 *
378 * Backslashes chars
379 *
380 ******************************************************************************/
381 struct BslsStruct {
382     ch        rune // meta char
383     validator FnValidator = unsafe { nil } // validator function pointer
384 }
385 
386 const bsls_validator_array = [
387     BslsStruct{`w`, is_alnum},
388     BslsStruct{`W`, is_not_alnum},
389     BslsStruct{`s`, is_space},
390     BslsStruct{`S`, is_not_space},
391     BslsStruct{`d`, is_digit},
392     BslsStruct{`D`, is_not_digit},
393     BslsStruct{`a`, is_lower},
394     BslsStruct{`A`, is_upper},
395 ]
396 
397 // these chars are escape if preceded by a \
398 const bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`,
399     `^`, `!`]
400 
401 enum BSLS_parse_state {
402     start
403     bsls_found
404     bsls_char
405     normal_char
406     hex_char
407 }
408 
409 // parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
410 fn (re &RE) parse_bsls(in_txt string, in_i int) (int, int, u32) {
411     mut status := BSLS_parse_state.start
412     mut i := in_i
413     mut hex_max_len := 2
414     mut hex_res := u32(0)
415     mut hex_count := 0
416 
417     for i < in_txt.len {
418         // get our char
419         char_tmp, char_len := re.get_char(in_txt, i)
420         ch := u8(char_tmp)
421         // println("ch [${ch:c}]")
422 
423         if status == .start && ch == `\\` {
424             status = .bsls_found
425             i += char_len
426             continue
427         }
428 
429         // check if is our bsls char, for now only one length sequence
430         if status == .bsls_found {
431             for c, x in bsls_validator_array {
432                 if x.ch == ch {
433                     return c, i - in_i + 1, hex_res
434                 }
435             }
436 
437             // check for \x00 hex 8bit
438             if ch == `x` {
439                 status = .hex_char
440                 hex_max_len = 2
441                 i += char_len
442                 continue
443             }
444 
445             // check for \x00 hex 16bit
446             if ch == `X` {
447                 status = .hex_char
448                 hex_max_len = 4
449                 i += char_len
450                 continue
451             }
452 
453             status = .normal_char
454             continue
455         }
456 
457         // manage hex byte
458         if status == .hex_char {
459             if ch >= `0` && ch <= `9` {
460                 hex_count++
461                 hex_res <<= 4
462                 hex_res += u32(ch - `0`)
463                 i += char_len
464             } else if ch >= `A` && ch <= `F` {
465                 hex_count++
466                 hex_res <<= 4
467                 hex_res += u32(ch - `A` + 10)
468                 i += char_len
469             } else if ch >= `a` && ch <= `f` {
470                 hex_count++
471                 hex_res <<= 4
472                 hex_res += u32(ch - `a` + 10)
473                 i += char_len
474             } else {
475                 return err_syntax_error, i, hex_res
476             }
477 
478             // println("hex_res: ${hex_res:08x} hex_count: ${hex_count}")
479 
480             // look for more hex digits
481             if hex_count < hex_max_len {
482                 continue
483             }
484 
485             // if over 8 nibble is more than 32 bit, error
486             if hex_count > hex_max_len {
487                 return err_syntax_error, i - in_i, hex_res
488             }
489 
490             if hex_count == hex_max_len {
491                 // we have a good result
492                 // println("RESULT hex_res: ${hex_res:08x} hex_count: ${hex_count}")
493                 return -2, i - in_i, hex_res
494             }
495 
496             // MUST NOT BE HERE!
497             return err_syntax_error, i, hex_res
498         }
499 
500         // no BSLS validator, manage as normal escape char char
501         if status == .normal_char {
502             if ch in bsls_escape_list {
503                 return no_match_found, i - in_i + 1, hex_res
504             }
505             return err_syntax_error, i - in_i + 1, hex_res
506         }
507 
508         // at the present time we manage only one char after the \
509         break
510     }
511     // not our bsls return KO
512     return err_syntax_error, i, hex_res
513 }
514 
515 /******************************************************************************
516 *
517 * Char class
518 *
519 ******************************************************************************/
520 const cc_null = 0 // empty cc token
521 
522 const cc_char = 1 // simple char: a
523 
524 const cc_int = 2 // char interval: a-z
525 
526 const cc_bsls = 3 // backslash char
527 
528 const cc_end = 4
529 
530 struct CharClass {
531 mut:
532     cc_type   int  // type of cc token
533     ch0       rune // first char of the interval a-b  a in this case
534     ch1       rune // second char of the interval a-b b in this case
535     validator FnValidator = unsafe { nil } // validator function pointer
536 }
537 
538 enum CharClass_parse_state {
539     start
540     in_char
541     in_bsls
542     separator
543     finish
544 }
545 
546 fn (re &RE) get_char_class(pc int) string {
547     buf := []u8{len: (re.cc.len)}
548     mut buf_ptr := unsafe { &u8(&buf) }
549 
550     mut cc_i := re.prog[pc].cc_index
551     mut i := 0
552     mut tmp := 0
553     for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end {
554         if re.cc[cc_i].cc_type == cc_bsls {
555             unsafe {
556                 buf_ptr[i] = `\\`
557                 i++
558                 buf_ptr[i] = u8(re.cc[cc_i].ch0)
559                 i++
560             }
561         } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 {
562             tmp = 3
563             for tmp >= 0 {
564                 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
565                 if x != 0 {
566                     unsafe {
567                         buf_ptr[i] = x
568                         i++
569                     }
570                 }
571                 tmp--
572             }
573         } else {
574             tmp = 3
575             for tmp >= 0 {
576                 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
577                 if x != 0 {
578                     unsafe {
579                         buf_ptr[i] = x
580                         i++
581                     }
582                 }
583                 tmp--
584             }
585             unsafe {
586                 buf_ptr[i] = `-`
587                 i++
588             }
589             tmp = 3
590             for tmp >= 0 {
591                 x := u8((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF)
592                 if x != 0 {
593                     unsafe {
594                         buf_ptr[i] = x
595                         i++
596                     }
597                 }
598                 tmp--
599             }
600         }
601         cc_i++
602     }
603     unsafe {
604         buf_ptr[i] = u8(0)
605     }
606     return unsafe { tos_clone(buf_ptr) }
607 }
608 
609 fn (re &RE) check_char_class(pc int, ch rune) bool {
610     mut cc_i := re.prog[pc].cc_index
611     for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != cc_end {
612         if re.cc[cc_i].cc_type == cc_bsls {
613             if re.validator_matches(re.cc[cc_i].validator, ch) {
614                 return true
615             }
616         } else if re.char_in_range(ch, re.cc[cc_i].ch0, re.cc[cc_i].ch1) {
617             return true
618         }
619         cc_i++
620     }
621     return false
622 }
623 
624 @[inline]
625 fn (re &RE) chars_equal(a rune, b rune) bool {
626     if a == b {
627         return true
628     }
629     if (re.flag & f_ci) == 0 {
630         return false
631     }
632     lower_a := re.case_transform_char(a, false)
633     lower_b := re.case_transform_char(b, false)
634     if lower_a == lower_b {
635         return true
636     }
637     return re.case_transform_char(a, true) == re.case_transform_char(b, true)
638 }
639 
640 @[inline]
641 fn (re &RE) char_in_range(ch rune, start rune, end rune) bool {
642     if ch >= start && ch <= end {
643         return true
644     }
645     if (re.flag & f_ci) == 0 {
646         return false
647     }
648     lower := re.case_transform_char(ch, false)
649     if lower != ch && lower >= start && lower <= end {
650         return true
651     }
652     upper := re.case_transform_char(ch, true)
653     return upper != ch && upper >= start && upper <= end
654 }
655 
656 @[inline]
657 fn (re &RE) validator_matches(validator FnValidator, ch rune) bool {
658     if validator(u8(ch)) {
659         return true
660     }
661     if (re.flag & f_ci) == 0 {
662         return false
663     }
664     lower := re.case_transform_char(ch, false)
665     if lower != ch && validator(u8(lower)) {
666         return true
667     }
668     upper := re.case_transform_char(ch, true)
669     return upper != ch && validator(u8(upper))
670 }
671 
672 fn (re &RE) case_transform_char(ch rune, upper bool) rune {
673     if upper {
674         if ch >= `a` && ch <= `z` {
675             return ch - 32
676         }
677         return ch
678     }
679     if ch >= `A` && ch <= `Z` {
680         return ch + 32
681     }
682     return ch
683 }
684 
685 @[inline]
686 fn (re &RE) token_matches(pc int, ch rune) bool {
687     return match re.prog[pc].ist {
688         ist_simple_char { re.chars_equal(re.prog[pc].ch, ch) }
689         ist_char_class_pos { re.check_char_class(pc, ch) }
690         ist_char_class_neg { !re.check_char_class(pc, ch) }
691         ist_bsls_char { re.validator_matches(re.prog[pc].validator, ch) }
692         else { false }
693     }
694 }
695 
696 // parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
697 fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, u32) {
698     mut status := CharClass_parse_state.start
699     mut i := in_i
700 
701     mut tmp_index := re.cc_index
702     res_index := re.cc_index
703 
704     mut cc_type := u32(ist_char_class_pos)
705 
706     for i < in_txt.len {
707         // check if we are out of memory for char classes
708         if tmp_index >= re.cc.len {
709             return err_cc_alloc_overflow, 0, u32(0)
710         }
711 
712         // get our char
713         char_tmp, char_len := re.get_char(in_txt, i)
714         ch := u8(char_tmp)
715 
716         // println("CC #${i:3d} ch: ${ch:c}")
717 
718         // negation
719         if status == .start && ch == `^` {
720             cc_type = u32(ist_char_class_neg)
721             i += char_len
722             continue
723         }
724 
725         // minus symbol
726         if status == .start && ch == `-` {
727             re.cc[tmp_index].cc_type = cc_char
728             re.cc[tmp_index].ch0 = char_tmp
729             re.cc[tmp_index].ch1 = char_tmp
730             i += char_len
731             tmp_index++
732             continue
733         }
734 
735         // bsls
736         if (status == .start || status == .in_char) && ch == `\\` {
737             // println("CC bsls.")
738             status = .in_bsls
739             i += char_len
740             continue
741         }
742 
743         if status == .in_bsls {
744             // println("CC bsls validation.")
745             for c, x in bsls_validator_array {
746                 if x.ch == ch {
747                     // println("CC bsls found [${ch:c}]")
748                     re.cc[tmp_index].cc_type = cc_bsls
749                     re.cc[tmp_index].ch0 = bsls_validator_array[c].ch
750                     re.cc[tmp_index].ch1 = bsls_validator_array[c].ch
751                     re.cc[tmp_index].validator = bsls_validator_array[c].validator
752                     i += char_len
753                     tmp_index++
754                     status = .in_char
755                     break
756                 }
757             }
758             if status == .in_bsls {
759                 // manage as a simple char
760                 // println("CC bsls not found [${ch:c}]")
761                 re.cc[tmp_index].cc_type = cc_char
762                 re.cc[tmp_index].ch0 = char_tmp
763                 re.cc[tmp_index].ch1 = char_tmp
764                 i += char_len
765                 tmp_index++
766                 status = .in_char
767                 continue
768             } else {
769                 continue
770             }
771         }
772 
773         // simple char
774         if (status == .start || status == .in_char) && ch != `-` && ch != `]` {
775             status = .in_char
776 
777             re.cc[tmp_index].cc_type = cc_char
778             re.cc[tmp_index].ch0 = char_tmp
779             re.cc[tmp_index].ch1 = char_tmp
780 
781             i += char_len
782             tmp_index++
783             continue
784         }
785 
786         // check range separator
787         if status == .in_char && ch == `-` {
788             status = .separator
789             i += char_len
790             continue
791         }
792 
793         // check range end
794         if status == .separator && ch != `]` && ch != `-` {
795             status = .in_char
796             re.cc[tmp_index - 1].cc_type = cc_int
797             re.cc[tmp_index - 1].ch1 = char_tmp
798             i += char_len
799             continue
800         }
801 
802         // char class end
803         if status == .in_char && ch == `]` {
804             re.cc[tmp_index].cc_type = cc_end
805             re.cc[tmp_index].ch0 = 0
806             re.cc[tmp_index].ch1 = 0
807             re.cc_index = tmp_index + 1
808 
809             return res_index, i - in_i + 2, cc_type
810         }
811 
812         i++
813     }
814     return err_syntax_error, 0, u32(0)
815 }
816 
817 /******************************************************************************
818 *
819 * Re Compiler
820 *
821 ******************************************************************************/
822 //
823 // Quantifier
824 //
825 enum Quant_parse_state {
826     start
827     min_parse
828     comma_checked
829     max_parse
830     greedy
831     gredy_parse
832     finish
833 }
834 
835 // parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
836 fn (re &RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
837     mut status := Quant_parse_state.start
838     mut i := in_i
839 
840     mut q_min := 0 // default min in a {} quantifier is 1
841     mut q_max := 0 // default max in a {} quantifier is max_quantifier
842 
843     mut ch := u8(0)
844 
845     for i < in_txt.len {
846         unsafe {
847             ch = in_txt.str[i]
848         }
849         // println("${ch:c} status: ${status}")
850 
851         // exit on no compatible char with {} quantifier
852         if utf8util_char_len(ch) != 1 {
853             return err_syntax_error, i, 0, false
854         }
855 
856         // min parsing skip if comma present
857         if status == .start && ch == `,` {
858             q_min = 0 // default min in a {} quantifier is 0
859             status = .comma_checked
860             i++
861             continue
862         }
863 
864         if status == .start && is_digit(ch) {
865             status = .min_parse
866             q_min *= 10
867             q_min += int(ch - `0`)
868             i++
869             continue
870         }
871 
872         if status == .min_parse && is_digit(ch) {
873             q_min *= 10
874             q_min += int(ch - `0`)
875             i++
876             continue
877         }
878 
879         // we have parsed the min, now check the max
880         if status == .min_parse && ch == `,` {
881             status = .comma_checked
882             i++
883             continue
884         }
885 
886         // single value {4}
887         if status == .min_parse && ch == `}` {
888             q_max = q_min
889             status = .greedy
890             continue
891         }
892 
893         // end without max
894         if status == .comma_checked && ch == `}` {
895             q_max = max_quantifier
896             status = .greedy
897             continue
898         }
899 
900         // start max parsing
901         if status == .comma_checked && is_digit(ch) {
902             status = .max_parse
903             q_max *= 10
904             q_max += int(ch - `0`)
905             i++
906             continue
907         }
908 
909         // parse the max
910         if status == .max_parse && is_digit(ch) {
911             q_max *= 10
912             q_max += int(ch - `0`)
913             i++
914             continue
915         }
916 
917         // finished the quantifier
918         if status == .max_parse && ch == `}` {
919             status = .greedy
920             continue
921         }
922 
923         // check if greedy flag char ? is present
924         if status == .greedy {
925             if i + 1 < in_txt.len {
926                 i++
927                 status = .gredy_parse
928                 continue
929             }
930             return q_min, q_max, i - in_i + 2, false
931         }
932 
933         // check the greedy flag
934         if status == .gredy_parse {
935             if ch == `?` {
936                 return q_min, q_max, i - in_i + 2, true
937             } else {
938                 i--
939                 return q_min, q_max, i - in_i + 2, false
940             }
941         }
942 
943         // not  a {} quantifier, exit
944         return err_syntax_error, i, 0, false
945     }
946 
947     // not a conform {} quantifier
948     return err_syntax_error, i, 0, false
949 }
950 
951 //
952 // Groups
953 //
954 enum Group_parse_state {
955     start
956     q_mark    // (?
957     q_mark1   // (?:|P  checking
958     p_status  // (?P
959     p_start   // (?P<
960     p_end     // (?P<...>
961     p_in_name // (?P<...
962     finish
963 }
964 
965 // parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index)
966 fn (re &RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
967     mut status := Group_parse_state.start
968     mut i := in_i
969     mut name := ''
970 
971     for i < in_txt.len && status != .finish {
972         // get our char
973         char_tmp, char_len := re.get_char(in_txt, i)
974         ch := u8(char_tmp)
975 
976         // start
977         if status == .start && ch == `(` {
978             status = .q_mark
979             i += char_len
980             continue
981         }
982 
983         // check for question marks
984         if status == .q_mark && ch == `?` {
985             status = .q_mark1
986             i += char_len
987             continue
988         }
989 
990         // negate group
991         if status == .q_mark1 && ch == `!` {
992             i += char_len
993             return 0, false, true, name, i
994         }
995 
996         // non capturing group
997         if status == .q_mark1 && ch == `:` {
998             i += char_len
999             return 0, false, false, name, i
1000         }
1001 
1002         // enter in P section
1003         if status == .q_mark1 && ch == `P` {
1004             status = .p_status
1005             i += char_len
1006             continue
1007         }
1008 
1009         // not a valid q mark found
1010         if status == .q_mark1 {
1011             // println("NO VALID Q MARK")
1012             return -2, true, false, name, i
1013         }
1014 
1015         if status == .p_status && ch == `<` {
1016             status = .p_start
1017             i += char_len
1018             continue
1019         }
1020 
1021         if status == .p_start && ch != `>` {
1022             status = .p_in_name
1023             name += '${ch:1c}' // TODO: manage utf8 chars
1024             i += char_len
1025             continue
1026         }
1027 
1028         // colect name
1029         if status == .p_in_name && ch != `>` && is_alnum(ch) {
1030             name += '${ch:1c}' // TODO: manage utf8 chars
1031             i += char_len
1032             continue
1033         }
1034 
1035         // end name
1036         if status == .p_in_name && ch == `>` {
1037             i += char_len
1038             return 0, true, false, name, i
1039         }
1040 
1041         // error on name group
1042         if status == .p_in_name {
1043             return -2, true, false, name, i
1044         }
1045 
1046         // normal group, nothig to do, exit
1047         return 0, true, false, name, i
1048     }
1049     // UNREACHABLE
1050     // println("ERROR!! NOT MEANT TO BE HERE!!1")
1051     return -2, true, false, name, i
1052 }
1053 
1054 const quntifier_chars = [rune(`+`), `*`, `?`, `{`]
1055 
1056 //
1057 // main compiler
1058 //
1059 // compile return (return code, index) where index is the index of the error in the query string if return code is an error code
1060 fn (mut re RE) impl_compile(in_txt string) (int, int) {
1061     mut i := 0 // input string index
1062     mut pc := 0 // program counter
1063 
1064     // group management variables
1065     mut group_count := -1
1066     mut group_stack := []int{len: re.group_max_nested, init: 0}
1067     mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
1068     mut group_stack_index := -1
1069 
1070     re.query = in_txt // save the query string
1071 
1072     i = 0
1073     for i < in_txt.len {
1074         mut char_tmp := u32(0)
1075         mut char_len := 0
1076         // println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
1077 
1078         char_tmp, char_len = re.get_char(in_txt, i)
1079 
1080         //
1081         // check special cases: $ ^
1082         //
1083         if char_len == 1 && i == 0 && u8(char_tmp) == `^` {
1084             re.flag |= f_ms
1085             i = i + char_len
1086             continue
1087         }
1088         if char_len == 1 && i == (in_txt.len - 1) && u8(char_tmp) == `$` {
1089             re.flag |= f_me
1090             i = i + char_len
1091             continue
1092         }
1093 
1094         // ist_group_start
1095         if char_len == 1 && pc >= 0 && u8(char_tmp) == `(` {
1096             // check max groups allowed
1097             if group_count > re.group_max {
1098                 return err_groups_overflow, i + 1
1099             }
1100             group_stack_index++
1101 
1102             // check max nested groups allowed
1103             if group_stack_index > re.group_max_nested {
1104                 return err_groups_max_nested, i + 1
1105             }
1106 
1107             tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt, i)
1108 
1109             // manage question mark format error
1110             if tmp_res < -1 {
1111                 return err_group_qm_notation, next_i
1112             }
1113 
1114             // println("Parse group: [${tmp_res}, ${cgroup_flag}, (${i},${next_i}), '${in_txt[i..next_i]}' ]")
1115             i = next_i
1116 
1117             if cgroup_flag == true {
1118                 group_count++
1119             }
1120 
1121             // calculate the group id
1122             // if it is a named group, recycle the group id
1123             // NOTE: **** the group index is +1 because map return 0 when not found!! ****
1124             mut group_id := group_count
1125             if cgroup_name.len > 0 {
1126                 // println("GROUP NAME: ${cgroup_name}")
1127                 if cgroup_name in re.group_map {
1128                     group_id = re.group_map[cgroup_name] - 1
1129                     group_count--
1130                 } else {
1131                     re.group_map[cgroup_name] = group_id + 1
1132                 }
1133             }
1134 
1135             group_stack_txt_index[group_stack_index] = i
1136             group_stack[group_stack_index] = pc
1137 
1138             re.prog[pc].ist = u32(0) | ist_group_start
1139             re.prog[pc].rep_min = 1
1140             re.prog[pc].rep_max = 1
1141 
1142             // manage negation groups
1143             if negate_flag == true {
1144                 re.prog[pc].group_neg = true
1145                 re.prog[pc].rep_min = 0 // may not be caught, but it is ok
1146             }
1147 
1148             // set the group id
1149             if cgroup_flag == false {
1150                 // println("NO CAPTURE GROUP")
1151                 re.prog[pc].group_id = -1
1152             } else {
1153                 re.prog[pc].group_id = group_id
1154             }
1155 
1156             pc = pc + 1
1157             continue
1158         }
1159 
1160         // ist_group_end
1161         if char_len == 1 && pc > 0 && u8(char_tmp) == `)` {
1162             if group_stack_index < 0 {
1163                 return err_group_not_balanced, i + 1
1164             }
1165 
1166             goto_pc := group_stack[group_stack_index]
1167             group_stack_index--
1168 
1169             re.prog[pc].ist = u32(0) | ist_group_end
1170             re.prog[pc].rep_min = 1
1171             re.prog[pc].rep_max = 1
1172 
1173             re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need
1174             re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data
1175 
1176             re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
1177             // re.prog[goto_pc].group_id = group_count         // id of this group, used for storing data
1178 
1179             // duplicate the negation group info and settings
1180             if re.prog[goto_pc].group_neg == true {
1181                 re.prog[pc].group_neg = re.prog[goto_pc].group_neg
1182                 re.prog[pc].rep_min = re.prog[goto_pc].rep_min
1183             }
1184 
1185             pc = pc + 1
1186             i = i + char_len
1187             continue
1188         }
1189 
1190         // ist_dot_char match any char except the following token
1191         if char_len == 1 && pc >= 0 && u8(char_tmp) == `.` {
1192             // consecutive ist_dot_char is a syntax error
1193             if pc > 0 && re.prog[pc - 1].ist == ist_dot_char {
1194                 return err_consecutive_dots, i
1195             }
1196 
1197             re.prog[pc].ist = u32(0) | ist_dot_char
1198             re.prog[pc].rep_min = 1
1199             re.prog[pc].rep_max = 1
1200             pc = pc + 1
1201             i = i + char_len
1202             continue
1203         }
1204 
1205         // OR branch
1206         if char_len == 1 && pc > 0 && u8(char_tmp) == `|` {
1207             if pc > 0 && re.prog[pc - 1].ist == ist_or_branch {
1208                 return err_syntax_error, i
1209             }
1210             re.prog[pc].ist = u32(0) | ist_or_branch
1211             re.prog[pc].source_index = i
1212             pc = pc + 1
1213             i = i + char_len
1214             continue
1215         }
1216 
1217         // Quantifiers
1218         if char_len == 1 && pc > 0 {
1219             mut char_next := rune(0)
1220             mut char_next_len := 0
1221             if (char_len + i) < in_txt.len {
1222                 char_next, char_next_len = re.get_char(in_txt, i + char_len)
1223             }
1224             mut quant_flag := true
1225 
1226             // negation groups can not have quantifiers
1227             if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
1228                 return err_neg_group_quantifier, i
1229             }
1230 
1231             match u8(char_tmp) {
1232                 `?` {
1233                     // println("q: ${char_tmp:c}")
1234                     // check illegal quantifier sequences
1235                     if char_next_len == 1 && char_next in quntifier_chars {
1236                         return err_syntax_error, i
1237                     }
1238                     re.prog[pc - 1].rep_min = 0
1239                     re.prog[pc - 1].rep_max = 1
1240                 }
1241                 `+` {
1242                     // println("q: ${char_tmp:c}")
1243                     // check illegal quantifier sequences
1244                     if char_next_len == 1 && char_next in quntifier_chars {
1245                         return err_syntax_error, i
1246                     }
1247                     re.prog[pc - 1].rep_min = 1
1248                     re.prog[pc - 1].rep_max = max_quantifier
1249                 }
1250                 `*` {
1251                     // println("q: ${char_tmp:c}")
1252                     // check illegal quantifier sequences
1253                     if char_next_len == 1 && char_next in quntifier_chars {
1254                         return err_syntax_error, i
1255                     }
1256                     re.prog[pc - 1].rep_min = 0
1257                     re.prog[pc - 1].rep_max = max_quantifier
1258                 }
1259                 `{` {
1260                     min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1)
1261                     // it is a quantifier
1262                     if min >= 0 {
1263                         // println("{${min},${max}}\n str:[${in_txt[i..i+tmp]}] greedy:${greedy}")
1264                         i = i + tmp
1265                         re.prog[pc - 1].rep_min = min
1266                         re.prog[pc - 1].rep_max = max
1267                         re.prog[pc - 1].greedy = greedy
1268                         // check illegal quantifier sequences
1269                         if i <= in_txt.len {
1270                             char_next, char_next_len = re.get_char(in_txt, i)
1271                             if char_next_len == 1 && char_next in quntifier_chars {
1272                                 return err_syntax_error, i
1273                             }
1274                         }
1275                         continue
1276                     } else {
1277                         return min, i
1278                     }
1279 
1280                     // TODO: decide if the open bracket can be conform without the close bracket
1281                     /*
1282                     // no conform, parse as normal char
1283                     else {
1284                         quant_flag = false
1285                     }
1286                     */
1287                 }
1288                 else {
1289                     quant_flag = false
1290                 }
1291             }
1292 
1293             if quant_flag {
1294                 i = i + char_len
1295                 continue
1296             }
1297         }
1298 
1299         // IST_CHAR_CLASS_*
1300         if char_len == 1 && pc >= 0 {
1301             if u8(char_tmp) == `[` {
1302                 cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1)
1303                 if cc_index >= 0 {
1304                     // println("index: ${cc_index} str:${in_txt[i..i+tmp]}")
1305                     i = i + tmp
1306                     re.prog[pc].ist = u32(0) | cc_type
1307                     re.prog[pc].cc_index = cc_index
1308                     re.prog[pc].rep_min = 1
1309                     re.prog[pc].rep_max = 1
1310                     pc = pc + 1
1311                     continue
1312                 }
1313                 // cc_class vector memory full
1314                 else if cc_index < 0 {
1315                     return cc_index, i
1316                 }
1317             }
1318         }
1319 
1320         // ist_bsls_char
1321         if char_len == 1 && pc >= 0 {
1322             if u8(char_tmp) == `\\` {
1323                 // if the index is negative:
1324                 // -1 ERROR
1325                 // -2 hex byte code BLSL
1326                 bsls_index, tmp, hex_res := re.parse_bsls(in_txt, i)
1327                 if bsls_index >= 0 {
1328                     i = i + tmp
1329                     re.prog[pc].ist = u32(0) | ist_bsls_char
1330                     re.prog[pc].rep_min = 1
1331                     re.prog[pc].rep_max = 1
1332                     re.prog[pc].validator = bsls_validator_array[bsls_index].validator
1333                     re.prog[pc].ch = bsls_validator_array[bsls_index].ch
1334                     pc = pc + 1
1335                     continue
1336                 }
1337                 // hex char
1338                 // this code can mange up to \x for 32 bit
1339                 // at the present time only 8/16 bit are used
1340                 else if bsls_index == -2 {
1341                     mut value := hex_res
1342                     mut value_list := []u32{cap: 4}
1343                     mut count := 0
1344                     for value > 0 {
1345                         value_list << value & 0xFF
1346                         value = value >> 8
1347                         count++
1348                     }
1349 
1350                     count--
1351                     for count >= 0 {
1352                         re.prog[pc].ist = ist_simple_char
1353                         re.prog[pc].ch = value_list[count]
1354                         re.prog[pc].ch_len = u8(char_len)
1355                         re.prog[pc].rep_min = 1
1356                         re.prog[pc].rep_max = 1
1357                         re.prog[pc].flag = 1 // state a byte char
1358                         // println("char: ${char_tmp:c}")
1359                         pc = pc + 1
1360                         count--
1361                     }
1362                     i = i + tmp
1363                     continue
1364                 }
1365                 // this is an escape char, skip the bsls and continue as a normal char
1366                 else if bsls_index == no_match_found {
1367                     i += char_len
1368                     char_tmp, char_len = re.get_char(in_txt, i)
1369                     // continue as simple char
1370                 }
1371                 // if not an escape or a bsls char then it is an error (at least for now!)
1372                 else {
1373                     return bsls_index, i + tmp
1374                 }
1375             }
1376         }
1377 
1378         // ist_simple_char
1379         re.prog[pc].ist = ist_simple_char
1380         re.prog[pc].ch = char_tmp
1381         re.prog[pc].ch_len = u8(char_len)
1382         re.prog[pc].rep_min = 1
1383         re.prog[pc].rep_max = 1
1384         // println("char: ${char_tmp:c}")
1385         pc = pc + 1
1386 
1387         i += char_len
1388     }
1389 
1390     // add end of the program
1391     re.prog[pc].ist = ist_prog_end
1392     re.prog_len = pc
1393 
1394     // check for unbalanced groups
1395     if group_stack_index != -1 {
1396         return err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1
1397     }
1398 
1399     // check for OR at the end of the program
1400     if pc > 0 && re.prog[pc - 1].ist == ist_or_branch {
1401         return err_syntax_error, in_txt.len - 1
1402     }
1403 
1404     // store the number of groups in the query
1405     re.group_count = group_count + 1
1406 
1407     //******************************************
1408     // Post processing
1409     //******************************************
1410 
1411     //
1412     // manage ist_dot_char
1413     //
1414 
1415     // find the checks for dot chars, if any...
1416     mut pc1 := 0
1417     mut dot_char_count := 0
1418     mut last_dot_char_pc := -1
1419     for pc1 < pc {
1420         if re.prog[pc1].ist == ist_dot_char {
1421             // println("Dot_char pc: ${pc1}")
1422             last_dot_char_pc = pc1
1423             dot_char_count++
1424             mut pc2 := pc1 + 1
1425             for pc2 < pc {
1426                 // consecutive dot chars is an error
1427                 if re.prog[pc2].ist == ist_dot_char {
1428                     return err_syntax_error, 0
1429                 }
1430                 if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] {
1431                     // println("Next dot char check is PC: ${pc2}")
1432                     re.prog[pc1].dot_check_pc = pc2
1433                     break
1434                 }
1435                 pc2++
1436             }
1437         }
1438         pc1++
1439     }
1440 
1441     // println("last_dot_char_pc: ${last_dot_char_pc}")
1442     if last_dot_char_pc >= 0 {
1443         pc1 = last_dot_char_pc + 1
1444         mut is_last_dot := true
1445         for pc1 < pc {
1446             if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] {
1447                 is_last_dot = false
1448                 break
1449             }
1450             pc1++
1451         }
1452         if is_last_dot {
1453             re.prog[last_dot_char_pc].last_dot_flag = true
1454         }
1455     }
1456 
1457     //
1458     // manage bsls_char
1459     //
1460 
1461     // find the checks for bsls, if any...
1462     pc1 = 0
1463     mut bsls_char_count := 0
1464     mut last_bsls_char_pc := -1
1465     for pc1 < pc {
1466         if re.prog[pc1].ist == ist_bsls_char {
1467             // println("bsls_char pc: ${pc1}")
1468             last_bsls_char_pc = pc1
1469             bsls_char_count++
1470             mut pc2 := pc1 + 1
1471             for pc2 < pc {
1472                 if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] {
1473                     // println("Next bsls check is PC: ${pc2}")
1474                     re.prog[pc1].bsls_check_pc = pc2
1475                     break
1476                 }
1477                 pc2++
1478             }
1479         }
1480         pc1++
1481     }
1482 
1483     // println('last_bsls_char_pc: ${last_bsls_char_pc}')
1484     if last_bsls_char_pc >= 0 {
1485         pc1 = last_bsls_char_pc + 1
1486         mut is_last_bsls := true
1487         for pc1 < pc {
1488             if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] {
1489                 is_last_bsls = false
1490                 break
1491             }
1492             pc1++
1493         }
1494         if is_last_bsls {
1495             re.prog[last_bsls_char_pc].last_dot_flag = true
1496         }
1497     }
1498 
1499     //
1500     // manage CC
1501     //
1502     pc1 = 0
1503     mut cc_char_count := 0
1504     mut last_cc_char_pc := -1
1505     for pc1 < pc {
1506         if re.prog[pc1].ist in [u32(ist_char_class_pos), ist_char_class_neg] {
1507             last_cc_char_pc = pc1
1508             cc_char_count++
1509             mut pc2 := pc1 + 1
1510             for pc2 < pc {
1511                 if re.prog[pc2].ist !in [u32(ist_prog_end), ist_group_end, ist_group_start] {
1512                     // println("Next CC check is PC: ${pc2}")
1513                     re.prog[pc1].cc_check_pc = pc2
1514                     break
1515                 }
1516                 pc2++
1517             }
1518         }
1519         pc1++
1520     }
1521 
1522     // println('last_cc_char_pc: ${last_cc_char_pc}')
1523     if last_cc_char_pc >= 0 {
1524         pc1 = last_cc_char_pc + 1
1525         mut is_last_cc := true
1526         for pc1 < pc {
1527             if re.prog[pc1].ist !in [u32(ist_prog_end), ist_group_end] {
1528                 is_last_cc = false
1529                 break
1530             }
1531             pc1++
1532         }
1533         if is_last_cc {
1534             re.prog[last_cc_char_pc].last_dot_flag = true
1535         }
1536     }
1537 
1538     //******************************************
1539 
1540     // OR branch
1541     // a|b|cd
1542     // d exit point
1543     // a,b,c branches
1544     // set the jump in the right places
1545     pc1 = 0
1546     for pc1 < pc - 2 {
1547         // println("Here ${pc1} ${pc-2}")
1548         // println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}")
1549         if re.prog[pc1 + 1].ist == ist_or_branch {
1550             // two consecutive OR are a syntax error
1551             if re.prog[pc1 + 2].ist == ist_or_branch {
1552                 return err_syntax_error, i
1553             }
1554 
1555             // check for []|[] errors
1556             if re.prog[pc1].ist == ist_char_class_pos && re.prog[pc1 + 2].ist == ist_char_class_pos {
1557                 return err_invalid_or_with_cc, re.prog[pc1 + 1].source_index
1558             }
1559         }
1560 
1561         // manange a|b chains like a|(b)|c|d...
1562         // standard solution
1563         if re.prog[pc1].ist != ist_or_branch && re.prog[pc1 + 1].ist == ist_or_branch
1564             && re.prog[pc1 + 2].ist != ist_or_branch {
1565             re.prog[pc1].next_is_or = true // set that the next token is an  OR
1566             re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump
1567 
1568             // match jump, if an OR chain the next token will be an OR token
1569             mut pc2 := pc1 + 2
1570             for pc2 < pc - 1 {
1571                 ist := re.prog[pc2].ist
1572                 if ist == ist_group_start {
1573                     re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1
1574                     break
1575                 }
1576                 if ist != ist_or_branch {
1577                     re.prog[pc1 + 1].rep_max = pc2 + 1
1578                     break
1579                 }
1580 
1581                 pc2++
1582             }
1583             // special case query of few chars, the true can't go on the first instruction
1584             if re.prog[pc1 + 1].rep_max == pc1 {
1585                 re.prog[pc1 + 1].rep_max = 3
1586             }
1587             // println("Compile OR postproc. [${pc1},OR ${pc1+1},${pc2}]")
1588             pc1 = pc2
1589             continue
1590         }
1591 
1592         pc1++
1593     }
1594 
1595     //******************************************
1596     // DEBUG PRINT REGEX GENERATED CODE
1597     //******************************************
1598     if re.debug > 0 {
1599         gc := re.get_code()
1600         re.log_func(gc)
1601     }
1602     //******************************************
1603 
1604     return compile_ok, 0
1605 }
1606 
1607 // get_code return the compiled code as regex string, note: may be different from the source!
1608 pub fn (re &RE) get_code() string {
1609     mut pc1 := 0
1610     mut res := strings.new_builder(re.cc.len * 2 * re.prog.len)
1611     res.write_string('========================================\nv RegEx compiler v ${v_regex_version} output:\n')
1612 
1613     mut stop_flag := false
1614 
1615     for pc1 <= re.prog.len {
1616         tk := re.prog[pc1]
1617         res.write_string('PC:${pc1:3d}')
1618 
1619         res.write_string(' ist: ')
1620         res.write_string('${tk.ist:8x}'.replace(' ', '0'))
1621         res.write_string(' ')
1622         ist := tk.ist
1623         if ist == ist_bsls_char {
1624             res.write_string('[\\${tk.ch:1c}]     BSLS')
1625             if tk.last_dot_flag == true {
1626                 res.write_string(' last!')
1627             }
1628         } else if ist == ist_prog_end {
1629             res.write_string('PROG_END')
1630             stop_flag = true
1631         } else if ist == ist_or_branch {
1632             res.write_string('OR      ')
1633         } else if ist == ist_char_class_pos {
1634             res.write_string('[${re.get_char_class(pc1)}]     CHAR_CLASS_POS')
1635             if tk.last_dot_flag == true {
1636                 res.write_string(' last!')
1637             }
1638         } else if ist == ist_char_class_neg {
1639             res.write_string('[^${re.get_char_class(pc1)}]    CHAR_CLASS_NEG')
1640             if tk.last_dot_flag == true {
1641                 res.write_string(' last!')
1642             }
1643         } else if ist == ist_dot_char {
1644             res.write_string('.        DOT_CHAR nx chk: ${tk.dot_check_pc}')
1645             if tk.last_dot_flag == true {
1646                 res.write_string(' last!')
1647             }
1648         } else if ist == ist_group_start {
1649             res.write_string('(        GROUP_START #:${tk.group_id}')
1650             if tk.group_id == -1 {
1651                 res.write_string(' ?:')
1652             } else {
1653                 for x in re.group_map.keys() {
1654                     if re.group_map[x] == (tk.group_id + 1) {
1655                         res.write_string(' ?P<${x}>')
1656                         break
1657                     }
1658                 }
1659             }
1660         } else if ist == ist_group_end {
1661             res.write_string(')        GROUP_END   #:${tk.group_id}')
1662         } else if ist == ist_simple_char {
1663             if tk.flag == 0 {
1664                 res.write_string('[${tk.ch:1c}]      query_ch')
1665             } else {
1666                 res.write_string('[0x${tk.ch:02X}]HEXquery_ch')
1667             }
1668         }
1669 
1670         if tk.rep_max == max_quantifier {
1671             res.write_string(' {${tk.rep_min:3d},MAX}')
1672         } else {
1673             if ist == ist_or_branch {
1674                 res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}')
1675             } else {
1676                 res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}')
1677             }
1678             if tk.greedy == true {
1679                 res.write_string('?')
1680             }
1681         }
1682 
1683         res.write_string('\n')
1684         if stop_flag {
1685             break
1686         }
1687         pc1++
1688     }
1689 
1690     res.write_string('========================================\n')
1691     return res.str()
1692 }
1693 
1694 // get_query return a string with a reconstruction of the query starting from the regex program code
1695 pub fn (re &RE) get_query() string {
1696     mut res := strings.new_builder(re.query.len * 2)
1697 
1698     if (re.flag & f_ms) != 0 {
1699         res.write_string('^')
1700     }
1701 
1702     mut i := 0
1703     for i < re.prog.len && re.prog[i].ist != ist_prog_end && re.prog[i].ist != 0 {
1704         tk := unsafe { &re.prog[i] }
1705         ch := tk.ist
1706 
1707         // GROUP start
1708         if ch == ist_group_start {
1709             if re.debug > 0 {
1710                 res.write_string('#${tk.group_id}')
1711             }
1712             res.write_string('(')
1713 
1714             if tk.group_neg == true {
1715                 res.write_string('?!') // negation group
1716             } else if tk.group_id == -1 {
1717                 res.write_string('?:') // non capturing group
1718             }
1719 
1720             for x in re.group_map.keys() {
1721                 if re.group_map[x] == (tk.group_id + 1) {
1722                     res.write_string('?P<${x}>')
1723                     break
1724                 }
1725             }
1726 
1727             i++
1728             continue
1729         }
1730 
1731         // GROUP end
1732         if ch == ist_group_end {
1733             res.write_string(')')
1734         }
1735 
1736         // OR branch
1737         if ch == ist_or_branch {
1738             res.write_string('|')
1739             if re.debug > 0 {
1740                 res.write_string('{${tk.rep_min},${tk.rep_max}}')
1741             }
1742             i++
1743             continue
1744         }
1745 
1746         // char class
1747         if ch == ist_char_class_neg || ch == ist_char_class_pos {
1748             res.write_string('[')
1749             if ch == ist_char_class_neg {
1750                 res.write_string('^')
1751             }
1752             res.write_string('${re.get_char_class(i)}')
1753             res.write_string(']')
1754         }
1755 
1756         // bsls char
1757         if ch == ist_bsls_char {
1758             res.write_string('\\${tk.ch:1c}')
1759         }
1760 
1761         // ist_dot_char
1762         if ch == ist_dot_char {
1763             res.write_string('.')
1764         }
1765 
1766         // char alone
1767         if ch == ist_simple_char {
1768             if tk.flag == 0 {
1769                 if u8(ch) in bsls_escape_list {
1770                     res.write_string('\\')
1771                 }
1772                 res.write_string('${tk.ch:c}')
1773             } else {
1774                 res.write_string('\\x${tk.ch:02x}')
1775             }
1776         }
1777 
1778         // quantifier
1779         if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
1780             if tk.rep_min == 0 && tk.rep_max == 1 {
1781                 res.write_string('?')
1782             } else if tk.rep_min == 1 && tk.rep_max == max_quantifier {
1783                 res.write_string('+')
1784             } else if tk.rep_min == 0 && tk.rep_max == max_quantifier {
1785                 res.write_string('*')
1786             } else {
1787                 if tk.rep_max == max_quantifier {
1788                     res.write_string('{${tk.rep_min},MAX}')
1789                 } else {
1790                     res.write_string('{${tk.rep_min},${tk.rep_max}}')
1791                 }
1792                 if tk.greedy == true {
1793                     res.write_string('?')
1794                 }
1795             }
1796         }
1797         i++
1798     }
1799     if (re.flag & f_me) != 0 {
1800         res.write_string('$')
1801     }
1802 
1803     return res.str()
1804 }
1805 
1806 /******************************************************************************
1807 *
1808 * Groups saving utilities
1809 *
1810 ******************************************************************************/
1811 @[direct_array_access]
1812 fn (mut re RE) group_continuous_save(g_index int) {
1813     if re.group_csave_flag == true {
1814         // continuous save, save until we have space
1815 
1816         // init the first element as counter
1817         if re.group_csave.len == 0 {
1818             re.group_csave << 0
1819         }
1820 
1821         gi := g_index >> 1
1822         start := re.groups[g_index]
1823         end := re.groups[g_index + 1]
1824 
1825         // check if we are simply increasing the size ot the found group
1826         if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3]
1827             && start == re.group_csave[re.group_csave.len - 2] {
1828             re.group_csave[re.group_csave.len - 1] = end
1829             return
1830         }
1831 
1832         // otherwise append a new group to the list
1833 
1834         // increment counter
1835         re.group_csave[0]++
1836         // save the record
1837         re.group_csave << (g_index >> 1) // group id
1838         re.group_csave << re.groups[g_index] // start
1839         re.group_csave << re.groups[g_index + 1] // end
1840     }
1841 }
1842 
1843 /******************************************************************************
1844 *
1845 * Matching
1846 *
1847 ******************************************************************************/
1848 enum Match_state {
1849     start = 0
1850     stop
1851     end
1852     new_line
1853     ist_load     // load and execute instruction
1854     ist_next     // go to next instruction
1855     ist_next_ks  // go to next instruction without clenaning the state
1856     ist_quant_p  // match positive ,quantifier check
1857     ist_quant_n  // match negative, quantifier check
1858     ist_quant_pg // match positive ,group quantifier check
1859     ist_quant_ng // match negative ,group quantifier check
1860 }
1861 
1862 fn state_str(s Match_state) string {
1863     match s {
1864         .start { return 'start' }
1865         .stop { return 'stop' }
1866         .end { return 'end' }
1867         .new_line { return 'new line' }
1868         .ist_load { return 'ist_load' }
1869         .ist_next { return 'ist_next' }
1870         .ist_next_ks { return 'ist_next_ks' }
1871         .ist_quant_p { return 'ist_quant_p' }
1872         .ist_quant_n { return 'ist_quant_n' }
1873         .ist_quant_pg { return 'ist_quant_pg' }
1874         .ist_quant_ng { return 'ist_quant_ng' }
1875     }
1876 }
1877 
1878 struct StateObj {
1879 pub mut:
1880     group_index int = -1 // group id used to know how many groups are open
1881     match_flag  bool // indicate if we are in a match condition
1882     match_index int = -1 // index of the last match
1883     first_match int = -1 // index of the first match
1884     pc          int = -1 // program counter
1885     i           int = -1 // source string index
1886     char_len    int // last char legth
1887     last_dot_pc int = -1 // last dot chat pc
1888 }
1889 
1890 @[direct_array_access]
1891 pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
1892     // result status
1893     mut result := no_match_found // function return
1894 
1895     mut ch := rune(0) // examinated char
1896     mut char_len := 0 // utf8 examinated char len
1897     mut m_state := Match_state.start // start point for the matcher FSM
1898     mut src_end := false
1899     mut last_fnd_pc := -1
1900 
1901     mut state := StateObj{} // actual state
1902     mut ist := u32(0) // actual instruction
1903     mut l_ist := u32(0) // last matched instruction
1904 
1905     mut step_count := 0 // stats for debug
1906     mut dbg_line := 0 // count debug line printed
1907 
1908     re.reset()
1909 
1910     if re.debug > 0 {
1911         // print header
1912         mut h_buf := strings.new_builder(32)
1913         h_buf.write_string('flags: ')
1914         h_buf.write_string('${re.flag:8x}'.replace(' ', '0'))
1915         h_buf.write_string('\n')
1916         sss := h_buf.str()
1917         re.log_func(sss)
1918     }
1919 
1920     for m_state != .end {
1921         if state.pc >= 0 && state.pc < re.prog.len {
1922             ist = re.prog[state.pc].ist
1923         } else if state.pc >= re.prog.len {
1924             // println("ERROR!! PC overflow!!")
1925             return err_internal_error, state.i
1926         }
1927 
1928         //******************************************
1929         // DEBUG LOG
1930         //******************************************
1931         if re.debug > 0 {
1932             mut buf2 := strings.new_builder(re.cc.len + 128)
1933 
1934             // print all the instructions
1935 
1936             // end of the input text
1937             if state.i >= in_txt_len {
1938                 buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n')
1939                 sss := buf2.str()
1940                 re.log_func(sss)
1941             } else {
1942                 // print only the exe instruction
1943                 if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 {
1944                     if ist == ist_prog_end {
1945                         buf2.write_string('# ${step_count:3d} PROG_END\n')
1946                     } else if ist == 0 || m_state in [.start, .ist_next, .stop] {
1947                         buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n')
1948                     } else {
1949                         ch, char_len = re.get_charb(in_txt, state.i)
1950 
1951                         buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>')
1952                         buf2.write_string('${ist:8x}'.replace(' ', '0'))
1953                         buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
1954 
1955                         if ist == ist_simple_char {
1956                             if re.prog[state.pc].flag == 0 {
1957                                 buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
1958                             } else {
1959                                 buf2.write_string('query_ch: [0x${re.prog[state.pc].ch:02X}]')
1960                             }
1961                         } else {
1962                             if ist == ist_bsls_char {
1963                                 buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
1964                             } else if ist == ist_prog_end {
1965                                 buf2.write_string('PROG_END')
1966                             } else if ist == ist_or_branch {
1967                                 buf2.write_string('OR')
1968                             } else if ist == ist_char_class_pos {
1969                                 buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]')
1970                             } else if ist == ist_char_class_neg {
1971                                 buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]')
1972                             } else if ist == ist_dot_char {
1973                                 buf2.write_string('DOT_CHAR')
1974                             } else if ist == ist_group_start {
1975                                 tmp_gi := re.prog[state.pc].group_id
1976                                 tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep
1977                                 buf2.write_string('GROUP_START #:${tmp_gi} rep:${tmp_gr} ')
1978                             } else if ist == ist_group_end {
1979                                 buf2.write_string('GROUP_END   #:${re.prog[state.pc].group_id} deep:${state.group_index}')
1980                             }
1981                         }
1982                         if re.prog[state.pc].rep_max == max_quantifier {
1983                             buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}')
1984                         } else {
1985                             buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}')
1986                         }
1987                         if re.prog[state.pc].greedy == true {
1988                             buf2.write_string('?')
1989                         }
1990                         buf2.write_string(' (#${state.group_index})')
1991 
1992                         if ist == ist_dot_char {
1993                             buf2.write_string(' last!')
1994                         }
1995 
1996                         buf2.write_string('\n')
1997                     }
1998                     sss2 := buf2.str()
1999                     re.log_func(sss2)
2000                 }
2001             }
2002             step_count++
2003             dbg_line++
2004         }
2005         //******************************************
2006 
2007         if ist == ist_prog_end {
2008             // println("HERE we end!")
2009             break
2010         }
2011 
2012         // we're out of text, manage it
2013         if state.i >= in_txt_len || m_state == .new_line {
2014             // println("Finished text!!")
2015             src_end = true
2016 
2017             // we have fished the text, we must manage out pf bound indexes
2018             if state.i >= in_txt_len {
2019                 state.i = in_txt_len - 1
2020             }
2021 
2022             // manage groups
2023             if state.group_index >= 0 && state.match_index >= 0 {
2024                 // println("End text with open groups!")
2025                 // println("state.group_index: ${state.group_index}")
2026                 // close the groups
2027                 for state.group_index >= 0 {
2028                     tmp_pc := re.group_data[state.group_index]
2029                     re.prog[tmp_pc].group_rep++
2030                     // println("Closing group ${state.group_index} {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
2031 
2032                     if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min
2033                         && re.prog[tmp_pc].group_id >= 0 {
2034                         start_i := re.group_stack[state.group_index]
2035                         re.group_stack[state.group_index] = -1
2036 
2037                         // save group results
2038                         g_index := re.prog[tmp_pc].group_id * 2
2039                         // println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
2040                         if start_i >= 0 {
2041                             re.groups[g_index] = start_i
2042                         } else {
2043                             re.groups[g_index] = 0
2044                         }
2045 
2046                         re.groups[g_index + 1] = state.i
2047 
2048                         if re.groups[g_index + 1] >= in_txt_len {
2049                             // println("clamp group on stop!")
2050                             re.groups[g_index + 1] = in_txt_len - 1
2051                         }
2052 
2053                         // continuous save, save until we have space
2054                         re.group_continuous_save(g_index)
2055                     }
2056                     state.group_index--
2057                 }
2058             }
2059 
2060             // println("re.groups: ${re.groups}")
2061 
2062             // the text is finished and the groups closed and we are the last group, ok exit
2063             if ist == ist_group_end && re.prog[state.pc + 1].ist == ist_prog_end {
2064                 // println("Last group end")
2065                 return state.first_match, state.i
2066             }
2067 
2068             if state.pc == -1 {
2069                 state.pc = last_fnd_pc
2070             }
2071 
2072             // println("Finished text!!")
2073             // println("Instruction: ${ist:08x} pc: ${state.pc}")
2074             // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}")
2075 
2076             // program end
2077             if ist == ist_prog_end {
2078                 // println("Program end on end of text!")
2079                 return state.first_match, state.i
2080             }
2081 
2082             if l_ist in [
2083                 u32(ist_char_class_neg),
2084                 ist_char_class_pos,
2085                 ist_bsls_char,
2086                 ist_dot_char,
2087             ] {
2088                 // println("***** We have a last special token")
2089                 // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
2090                 // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
2091                 // println("first match: ${state.first_match}")
2092 
2093                 if re.prog[state.pc].last_dot_flag == true
2094                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min
2095                     && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
2096                     return state.first_match, state.i
2097                 }
2098                 // println("Not fitted!!")
2099             }
2100             // no groups open, check the last token quantifier
2101             if ist != ist_group_end && re.prog[state.pc + 1].ist == ist_prog_end {
2102                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
2103                     && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
2104                     // println("We are in good repetition")
2105                     return state.first_match, state.i
2106                 }
2107             }
2108 
2109             // println("No good exit!!")
2110             if re.prog[re.prog_len - 1].ist == ist_group_end {
2111                 // println("last ist is a group end!")
2112                 if re.prog[re.prog_len - 1].group_rep >= re.prog[re.prog_len - 1].rep_min {
2113                     return state.first_match, state.i
2114                 }
2115             }
2116             return no_match_found, state.i
2117         }
2118 
2119         // starting and init
2120         if m_state == .start {
2121             state.pc = -1
2122             state.i = 0
2123             m_state = .ist_next
2124             continue
2125         }
2126         // ist_next, next instruction reseting its state
2127         else if m_state == .ist_next {
2128             state.pc = state.pc + 1
2129             re.prog[state.pc].reset()
2130             // check if we are in the program bounds
2131             if state.pc < 0 || state.pc > re.prog.len {
2132                 // println("ERROR!! PC overflow!!")
2133                 return err_internal_error, state.i
2134             }
2135             m_state = .ist_load
2136             continue
2137         }
2138         // ist_next_ks, next instruction keeping its state
2139         else if m_state == .ist_next_ks {
2140             state.pc = state.pc + 1
2141             // check if we are in the program bounds
2142             if state.pc < 0 || state.pc > re.prog.len {
2143                 // println("ERROR!! PC overflow!!")
2144                 return err_internal_error, state.i
2145             }
2146             m_state = .ist_load
2147             continue
2148         }
2149 
2150         // load the char
2151         ch, char_len = re.get_charb(in_txt, state.i)
2152 
2153         // check new line if flag f_nl enabled
2154         if (re.flag & f_nl) != 0 && char_len == 1 && u8(ch) in new_line_list {
2155             m_state = .new_line
2156             continue
2157         }
2158         // check if stop
2159         else if m_state == .stop {
2160             // we are in search mode, don't exit until the end
2161             if (re.flag & f_src) != 0 && ist != ist_prog_end {
2162                 last_fnd_pc = state.pc
2163                 state.pc = -1
2164                 state.i += char_len
2165 
2166                 m_state = .ist_next
2167                 re.reset_src()
2168                 state.match_index = -1
2169                 state.first_match = -1
2170 
2171                 // reset state list
2172                 re.reset()
2173 
2174                 continue
2175             }
2176 
2177             if ist == ist_prog_end {
2178                 return state.first_match, state.i
2179             }
2180 
2181             // manage here dot char
2182 
2183             if re.state_list.len > 0 {
2184                 // println("Here we are, with stop: state buffer: [${re.state_list.len}]")
2185                 state = re.state_list.pop()
2186 
2187                 state.match_flag = true
2188                 l_ist = u32(ist_dot_char)
2189 
2190                 if state.first_match < 0 {
2191                     state.first_match = state.i
2192                 }
2193                 state.match_index = state.i
2194                 re.prog[state.pc].rep++ // increase repetitions
2195 
2196                 state.i += char_len
2197                 m_state = .ist_quant_p
2198                 continue
2199             }
2200 
2201             // exit on no match
2202             return result, state.i
2203         }
2204         // ist_load
2205         else if m_state == .ist_load {
2206             // program end
2207             if ist == ist_prog_end {
2208                 // if we are in match exit well
2209 
2210                 if state.group_index >= 0 && state.match_index >= 0 {
2211                     state.group_index = -1
2212                 }
2213 
2214                 m_state = .stop
2215                 continue
2216             }
2217             // check GROUP start, no quantifier is checkd for this token!!
2218             else if ist == ist_group_start {
2219                 state.group_index++
2220                 re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
2221                 re.group_stack[state.group_index] = state.i // index where we start to manage
2222                 // println("group_index ${state.group_index} rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
2223 
2224                 m_state = .ist_next
2225                 continue
2226             }
2227             // check GROUP end
2228             else if ist == ist_group_end {
2229                 // we are in matching streak
2230                 // println("Group END!! last ist: ${l_ist:08x}")
2231                 if state.match_index >= 0 {
2232                     // restore txt index stack and save the group data
2233 
2234                     // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
2235                     if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
2236                         start_i := re.group_stack[state.group_index]
2237 
2238                         // save group results
2239                         g_index := re.prog[state.pc].group_id * 2
2240 
2241                         if start_i >= 0 {
2242                             re.groups[g_index] = start_i
2243                         } else {
2244                             re.groups[g_index] = 0
2245                         }
2246 
2247                         re.groups[g_index + 1] = state.i
2248 
2249                         if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] {
2250                             re.groups[g_index] = re.groups[g_index - 1]
2251                         }
2252 
2253                         if re.groups[g_index + 1] >= in_txt_len {
2254                             // println("clamp group!")
2255                             re.groups[g_index + 1] = in_txt_len - 1
2256                         }
2257 
2258                         // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: ${state.i} in_txt_len: ${in_txt_len}")
2259 
2260                         // continuous save, save until we have space
2261                         re.group_continuous_save(g_index)
2262                     }
2263 
2264                     re.prog[state.pc].group_rep++ // increase repetitions
2265                     // println("GROUP ${group_index} END ${re.prog[state.pc].group_rep}")
2266                     if re.prog[state.pc].group_rep > in_txt_len - 1 {
2267                         m_state = .ist_quant_ng
2268                         continue
2269                     }
2270 
2271                     m_state = .ist_quant_pg
2272                     continue
2273                 }
2274 
2275                 m_state = .ist_quant_ng
2276                 continue
2277             }
2278             // check OR
2279             else if ist == ist_or_branch {
2280                 if state.match_index >= 0 {
2281                     state.pc = re.prog[state.pc].rep_max
2282                     // println("ist_or_branch True pc: ${state.pc}")
2283                 } else {
2284                     state.pc = re.prog[state.pc].rep_min
2285                     // println("ist_or_branch False pc: ${state.pc}")
2286                 }
2287                 re.prog[state.pc].reset()
2288                 m_state = .ist_load
2289                 continue
2290             }
2291             // check ist_dot_char
2292             else if ist == ist_dot_char {
2293                 // println("ist_dot_char rep: ${re.prog[state.pc].rep}")
2294 
2295                 // check next token to be false
2296                 mut next_check_flag := false
2297 
2298                 // if we are done with max go on dot char are dedicated case!!
2299                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2300                     re.state_list.pop()
2301                     m_state = .ist_next
2302                     continue
2303                 }
2304 
2305                 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].dot_check_pc >= 0
2306                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2307                     // load the char
2308                     // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2309                     ch_t := ch
2310                     chk_pc := re.prog[state.pc].dot_check_pc
2311                     next_check_flag = re.token_matches(chk_pc, ch_t)
2312                 }
2313 
2314                 // check if we must continue or pass to the next IST
2315                 if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end {
2316                     // println("save the state!!")
2317                     mut dot_state := StateObj{
2318                         group_index: state.group_index
2319                         match_flag:  state.match_flag
2320                         match_index: state.match_index
2321                         first_match: state.first_match
2322                         pc:          state.pc
2323                         i:           state.i + char_len
2324                         char_len:    char_len
2325                         last_dot_pc: state.pc
2326                     }
2327                     // if we are mananging a .* stay on the same char on return
2328                     if re.prog[state.pc].rep_min == 0 {
2329                         dot_state.i -= char_len
2330                     }
2331 
2332                     re.state_list << dot_state
2333 
2334                     m_state = .ist_quant_n
2335                     // println("dot_char stack len: ${re.state_list.len}")
2336                     continue
2337                 }
2338 
2339                 state.match_flag = true
2340                 l_ist = u32(ist_dot_char)
2341 
2342                 if state.first_match < 0 {
2343                     state.first_match = state.i
2344                 }
2345                 state.match_index = state.i
2346                 re.prog[state.pc].rep++ // increase repetitions
2347 
2348                 state.i += char_len
2349                 m_state = .ist_quant_p
2350                 continue
2351             }
2352             // char class IST
2353             else if ist == ist_char_class_pos || ist == ist_char_class_neg {
2354                 // check next token to be false
2355                 mut next_check_flag := false
2356 
2357                 // if we are done with max go on dot char are dedicated case!!
2358                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2359                     re.state_list.pop()
2360                     m_state = .ist_next
2361                     continue
2362                 }
2363 
2364                 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].cc_check_pc >= 0
2365                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2366                     // load the char
2367                     // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2368                     ch_t := ch
2369                     chk_pc := re.prog[state.pc].cc_check_pc
2370                     next_check_flag = re.token_matches(chk_pc, ch_t)
2371                 }
2372 
2373                 // check if we must continue or pass to the next IST
2374                 if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end {
2375                     // println("save the state!!")
2376                     mut dot_state := StateObj{
2377                         group_index: state.group_index
2378                         match_flag:  state.match_flag
2379                         match_index: state.match_index
2380                         first_match: state.first_match
2381                         pc:          state.pc
2382                         i:           state.i + char_len
2383                         char_len:    char_len
2384                         last_dot_pc: state.pc
2385                     }
2386                     // if we are managing a \[something]* stay on the same char on return
2387                     if re.prog[state.pc].rep_min == 0 {
2388                         dot_state.i -= char_len
2389                     }
2390 
2391                     re.state_list << dot_state
2392 
2393                     m_state = .ist_quant_n
2394                     // println("dot_char stack len: ${re.state_list.len}")
2395                     continue
2396                 }
2397 
2398                 // println("HERE WE MUST STAY! ${state.i} >= ${in_txt_len}")
2399 
2400                 state.match_flag = false
2401                 mut cc_neg := false
2402 
2403                 if ist == ist_char_class_neg {
2404                     cc_neg = true
2405                 }
2406                 mut cc_res := re.check_char_class(state.pc, ch)
2407 
2408                 // manage out of text on char class parse
2409                 if state.i >= (in_txt_len - 1) && cc_neg && re.prog[state.pc].last_dot_flag {
2410                     m_state = .ist_quant_n
2411                     continue
2412                 }
2413 
2414                 if cc_neg {
2415                     cc_res = !cc_res
2416                 }
2417 
2418                 if cc_res {
2419                     state.match_flag = true
2420                     l_ist = u32(ist_char_class_pos)
2421 
2422                     if state.first_match < 0 {
2423                         state.first_match = state.i
2424                     }
2425 
2426                     state.match_index = state.i
2427 
2428                     re.prog[state.pc].rep++ // increase repetitions
2429                     state.i += char_len // next char
2430                     m_state = .ist_quant_p
2431                     continue
2432                 }
2433                 m_state = .ist_quant_n
2434                 continue
2435             }
2436             // check bsls
2437             else if ist == ist_bsls_char {
2438                 // println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
2439 
2440                 // check next token to be false
2441                 mut next_check_flag := false
2442 
2443                 // if we are done with max go on dot char are dedicated case!!
2444                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2445                     re.state_list.pop()
2446                     m_state = .ist_next
2447                     continue
2448                 }
2449 
2450                 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].bsls_check_pc >= 0
2451                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2452                     // load the char
2453                     // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2454                     ch_t := ch
2455                     chk_pc := re.prog[state.pc].bsls_check_pc
2456                     next_check_flag = re.token_matches(chk_pc, ch_t)
2457                 }
2458 
2459                 // check if we must continue or pass to the next IST
2460                 if next_check_flag == true && re.prog[state.pc + 1].ist != ist_prog_end {
2461                     // println("save the state!!")
2462                     mut dot_state := StateObj{
2463                         group_index: state.group_index
2464                         match_flag:  state.match_flag
2465                         match_index: state.match_index
2466                         first_match: state.first_match
2467                         pc:          state.pc
2468                         i:           state.i + char_len
2469                         char_len:    char_len
2470                         last_dot_pc: state.pc
2471                     }
2472                     // if we are managing a \[something]* stay on the same char on return
2473                     if re.prog[state.pc].rep_min == 0 {
2474                         dot_state.i -= char_len
2475                     }
2476 
2477                     re.state_list << dot_state
2478 
2479                     m_state = .ist_quant_n
2480                     // println("dot_char stack len: ${re.state_list.len}")
2481                     continue
2482                 }
2483 
2484                 tmp_res := re.validator_matches(re.prog[state.pc].validator, ch)
2485                 if tmp_res == false {
2486                     m_state = .ist_quant_n
2487                     continue
2488                 }
2489                 // println("${ch} => ${tmp_res}")
2490 
2491                 state.match_flag = true
2492                 l_ist = u32(ist_dot_char)
2493 
2494                 if state.first_match < 0 {
2495                     state.first_match = state.i
2496                 }
2497                 state.match_index = state.i
2498                 re.prog[state.pc].rep++ // increase repetitions
2499 
2500                 state.i += char_len
2501                 m_state = .ist_quant_p
2502                 continue
2503             }
2504             // simple char IST
2505             else if ist == ist_simple_char {
2506                 // println("ist_simple_char")
2507                 state.match_flag = false
2508 
2509                 if re.chars_equal(re.prog[state.pc].ch, ch)
2510                     && (state.i < in_txt_len - 1 || re.prog[state.pc].ch != 0) {
2511                     state.match_flag = true
2512                     l_ist = ist_simple_char
2513 
2514                     if state.first_match < 0 {
2515                         state.first_match = state.i
2516                     }
2517                     // println("state.match_index: ${state.match_index}")
2518                     state.match_index = state.i
2519 
2520                     re.prog[state.pc].rep++ // increase repetitions
2521                     state.i += char_len // next char
2522                     m_state = .ist_quant_p
2523                     continue
2524                 }
2525                 m_state = .ist_quant_n
2526                 continue
2527             }
2528             // UNREACHABLE
2529             // println("PANIC2!! state: ${m_state}")
2530             return err_internal_error, state.i
2531         }
2532         /***********************************
2533         * Quantifier management
2534         ***********************************/
2535         // ist_quant_ng => quantifier negative test on group
2536         else if m_state == .ist_quant_ng {
2537             // we are finished here
2538             if state.group_index < 0 {
2539                 // println("Early stop!")
2540                 result = no_match_found
2541                 m_state = .stop
2542                 continue
2543             }
2544 
2545             tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
2546             rep := re.prog[tmp_pc].group_rep // use a temp variable
2547             re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2548 
2549             // println(".ist_quant_ng group_pc_end: ${tmp_pc} rep: ${rep}")
2550 
2551             if rep >= re.prog[tmp_pc].rep_min {
2552                 // println("ist_quant_ng GROUP CLOSED OK group_index: ${state.group_index}")
2553 
2554                 state.i = re.group_stack[state.group_index]
2555                 state.pc = tmp_pc
2556                 state.group_index--
2557                 m_state = .ist_next
2558                 continue
2559             } else if re.prog[tmp_pc].next_is_or {
2560                 // println("ist_quant_ng OR Negative branch")
2561 
2562                 state.i = re.group_stack[state.group_index]
2563                 state.pc = re.prog[tmp_pc + 1].rep_min - 1
2564                 state.group_index--
2565                 m_state = .ist_next
2566                 continue
2567             } else if rep > 0 && rep < re.prog[tmp_pc].rep_min {
2568                 // println("ist_quant_ng UNDER THE MINIMUM g.i: ${state.group_index}")
2569 
2570                 // check if we are inside a group, if yes exit from the nested groups
2571                 if state.group_index > 0 {
2572                     state.group_index--
2573                     state.pc = tmp_pc
2574                     m_state = .ist_quant_ng //.ist_next
2575                     continue
2576                 }
2577 
2578                 if state.group_index == 0 {
2579                     state.group_index--
2580                     state.pc = tmp_pc // TEST
2581                     m_state = .ist_next
2582                     continue
2583                 }
2584 
2585                 result = no_match_found
2586                 m_state = .stop
2587                 continue
2588             } else if rep == 0 && rep < re.prog[tmp_pc].rep_min {
2589                 // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: ${state.group_index}")
2590 
2591                 if state.group_index > 0 {
2592                     state.group_index--
2593                     state.pc = tmp_pc
2594                     m_state = .ist_quant_ng //.ist_next
2595                     continue
2596                 }
2597 
2598                 result = no_match_found
2599                 m_state = .stop
2600                 continue
2601             }
2602 
2603             // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${rep}")
2604             // UNREACHABLE
2605             return err_internal_error, state.i
2606         }
2607         // ist_quant_pg => quantifier positive test on group
2608         else if m_state == .ist_quant_pg {
2609             // println(".ist_quant_pg")
2610             mut tmp_pc := state.pc
2611             if state.group_index >= 0 {
2612                 tmp_pc = re.group_data[state.group_index]
2613             }
2614 
2615             if re.prog[tmp_pc].group_neg == true {
2616                 // println("***** Negation of the group")
2617                 result = no_match_found
2618                 m_state = .stop
2619                 continue
2620             }
2621 
2622             rep := re.prog[tmp_pc].group_rep
2623 
2624             if rep < re.prog[tmp_pc].rep_min {
2625                 // println("ist_quant_pg UNDER RANGE")
2626                 state.pc = re.prog[tmp_pc].goto_pc
2627                 m_state = .ist_next
2628                 continue
2629             } else if rep == re.prog[tmp_pc].rep_max {
2630                 // println("ist_quant_pg MAX RANGE")
2631                 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2632                 state.group_index--
2633                 m_state = .ist_next
2634 
2635                 continue
2636             } else if rep >= re.prog[tmp_pc].rep_min {
2637                 // println("ist_quant_pg IN RANGE group_index:${state.group_index}")
2638 
2639                 // check greedy flag, if true exit on minimum
2640                 if re.prog[tmp_pc].greedy == true {
2641                     re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2642                     state.group_index--
2643                     m_state = .ist_next
2644                     continue
2645                 }
2646 
2647                 state.pc = re.prog[tmp_pc].goto_pc - 1
2648                 state.group_index--
2649                 m_state = .ist_next
2650                 continue
2651             }
2652 
2653             // UNREACHABLE
2654             // println("PANIC3!! state: ${m_state}")
2655             return err_internal_error, state.i
2656         }
2657         // ist_quant_n => quantifier negative test on token
2658         else if m_state == .ist_quant_n {
2659             rep := re.prog[state.pc].rep
2660             // println("Here!! PC ${state.pc} is_next_or: ${re.prog[state.pc].next_is_or}")
2661 
2662             // zero quantifier * or ?
2663             if rep == 0 && re.prog[state.pc].rep_min == 0 {
2664                 // println("ist_quant_n c_zero RANGE MIN")
2665                 m_state = .ist_next // go to next ist
2666                 continue
2667             }
2668             // match + or *
2669             else if rep >= re.prog[state.pc].rep_min {
2670                 // println("ist_quant_n MATCH RANGE")
2671                 m_state = .ist_next
2672                 continue
2673             }
2674 
2675             // check the OR if present
2676             if re.prog[state.pc].next_is_or {
2677                 // println("OR present on failing")
2678                 state.match_index = -1
2679                 m_state = .ist_next
2680                 continue
2681             }
2682 
2683             // we are in a group manage no match from here
2684             if state.group_index >= 0 {
2685                 // println("ist_quant_n FAILED insied a GROUP group_index:${state.group_index}")
2686                 m_state = .ist_quant_ng
2687                 continue
2688             }
2689 
2690             // no other options
2691             // println("ist_quant_n no_match_found")
2692             result = no_match_found
2693             m_state = .stop
2694 
2695             // stop already started matching outside a capturing group
2696             if re.state_list.len > 0 && re.state_list.last().group_index == -1
2697                 && re.state_list.last().last_dot_pc > 0 {
2698                 if ist == ist_dot_char || ist == ist_bsls_char {
2699                     return no_match_found, 0
2700                 }
2701             }
2702             continue
2703         }
2704         // ist_quant_p => quantifier positive test on token
2705         else if m_state == .ist_quant_p {
2706             // println("Here .ist_quant_p")
2707             // exit on first match
2708             if (re.flag & f_efm) != 0 {
2709                 return state.i, state.i + 1
2710             }
2711 
2712             rep := re.prog[state.pc].rep
2713             // println("ist_quant_p rep: ${rep} rep_min: ${re.prog[state.pc].rep_min}")
2714 
2715             // under range
2716             if rep > 0 && rep < re.prog[state.pc].rep_min {
2717                 // println("ist_quant_p UNDER RANGE")
2718                 m_state = .ist_load // continue the loop
2719                 continue
2720             }
2721             // range ok, continue loop
2722             else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max {
2723                 // println("ist_quant_p IN RANGE")
2724 
2725                 // check greedy flag, if true exit on minimum
2726                 if re.prog[state.pc].greedy == true {
2727                     m_state = .ist_next
2728                     continue
2729                 }
2730                 m_state = .ist_load
2731                 continue
2732             }
2733             // max reached
2734             else if rep == re.prog[state.pc].rep_max {
2735                 // println("ist_quant_p MAX RANGE")
2736                 m_state = .ist_next
2737                 continue
2738             }
2739         }
2740         // UNREACHABLE
2741         // println("PANIC4!! state: ${m_state}")
2742         return err_internal_error, state.i
2743     }
2744 
2745     // println("Check end of text!")
2746     // Check the results
2747     if ist == ist_prog_end && state.first_match < 0 {
2748         return state.i, state.i
2749     }
2750     if state.match_index >= 0 {
2751         if state.group_index < 0 {
2752             if re.prog[state.pc].ist == ist_prog_end {
2753                 // println("program ended!!")
2754 
2755                 if (re.flag & f_src) != 0 {
2756                     // println("find return")
2757                     return state.first_match, state.i
2758                 } else {
2759                     // println("Here!!")
2760                     return 0, state.i
2761                 }
2762             }
2763 
2764             // println("No Group here, natural end [${state.first_match},${state.i}] state: ${state_str(m_state)} ist: ${ist} pgr_end: ${re.prog.len}")
2765 
2766             if re.prog[state.pc + 1].ist == ist_prog_end || re.prog[state.pc].ist == ist_prog_end {
2767                 rep := re.prog[state.pc].rep
2768                 // println("rep: ${rep} re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}")
2769                 if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max {
2770                     return state.first_match, state.i
2771                 }
2772                 // println("Program not finished! ")
2773                 return no_match_found, state.i
2774             }
2775             if src_end {
2776                 // println("program end")
2777                 return state.first_match, state.i
2778             }
2779             // print("No match found!!")
2780             return no_match_found, state.i
2781         } else {
2782             // println("Group match! OK")
2783             // println("first_match: ${state.first_match}, i: ${state.i}")
2784 
2785             // println("Skip last group")
2786             return state.first_match, state.i
2787             // return state.first_match,re.group_stack[state.group_index--]
2788         }
2789     }
2790     // println("no_match_found, natural end")
2791     return no_match_found, state.i
2792 }
2793