v / vlib / regex / regex_util.v
596 lines · 539 sloc · 14.34 KB · 7dca3d91f8b54e78fe10ffb51e96c8d5e239f7c8
Raw
1/*
2regex 1.0 alpha
3
4Copyright (c) 2019-2024 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7*/
8module regex
9
10import strings
11
12/******************************************************************************
13*
14* Inits
15*
16******************************************************************************/
// regex_base compiles `pattern` and returns the resulting regex object (`RE`)
// followed by the two values produced by the compiler: `re_err` and `err_pos`.
pub fn regex_base(pattern string) (RE, int, int) {
	// every internal buffer is sized from the length of the pattern
	pattern_len := pattern.len
	half_len := pattern_len >> 1

	mut re := RE{}
	re.prog = []Token{len: pattern_len + 1} // the program can not be longer than the pattern
	re.cc = []CharClass{len: pattern_len} // there can not be more char classes than pattern chars
	re.group_csave_flag = false // the continuous group saving flag starts cleared
	re.group_max_nested = half_len // max nesting depth is half of the pattern length
	re.group_max = half_len // there can not be more groups than half of the pattern length

	re.group_stack = []int{len: re.group_max, init: -1}
	re.group_data = []int{len: re.group_max, init: -1}

	re_err, err_pos := re.impl_compile(pattern)
	return re, re_err, err_pos
}
34
35/******************************************************************************
36*
37* Utilities
38*
39******************************************************************************/
// get_group_bounds_by_name returns the `start, end` values stored for the group
// named `group_name`, or `-1, -1` when the name is not present in `re.group_map`.
pub fn (re &RE) get_group_bounds_by_name(group_name string) (int, int) {
	if group_name !in re.group_map {
		return -1, -1
	}
	// the value stored in group_map minus one is the group index,
	// each group owns two consecutive slots of re.groups
	base := (re.group_map[group_name] - 1) * 2
	return re.groups[base], re.groups[base + 1]
}
50
// get_group_by_name returns the part of `in_txt` covered by the group named `group_name`.
// An empty string is returned when the name is not present in `re.group_map`
// or when the stored bounds do not describe a non empty range.
pub fn (re &RE) get_group_by_name(in_txt string, group_name string) string {
	if group_name !in re.group_map {
		return ''
	}
	base := (re.group_map[group_name] - 1) * 2
	start := re.groups[base]
	end := re.groups[base + 1]
	if start < 0 || end <= start {
		return ''
	}
	return in_txt[start..end]
}
63
// get_group_by_id returns the part of `in_txt` covered by the group with id `group_id`.
// An empty string is returned when `group_id` is negative, when it is not smaller
// than the number of group slot pairs in `re.groups`, or when the stored bounds
// do not describe a non empty range.
pub fn (re &RE) get_group_by_id(in_txt string, group_id int) string {
	// a negative id would index re.groups before its first element
	if group_id < 0 || group_id >= (re.groups.len >> 1) {
		return ''
	}
	index := group_id * 2
	start := re.groups[index]
	end := re.groups[index + 1]
	if start >= 0 && end > start {
		return in_txt[start..end]
	}
	return ''
}
76
// get_group_bounds_by_id returns the `start, end` values stored for the group
// with id `group_id`, or `-1, -1` when `group_id` is negative or not smaller
// than `re.group_count`.
pub fn (re &RE) get_group_bounds_by_id(group_id int) (int, int) {
	// a negative id would index re.groups before its first element
	if group_id < 0 || group_id >= re.group_count {
		return -1, -1
	}
	index := group_id * 2
	return re.groups[index], re.groups[index + 1]
}
85
// Re_group holds the bounds of one group as returned by `get_group_list`.
// Both fields are -1 unless they are set explicitly.
pub struct Re_group {
pub:
	start int = -1 // start value of the group, -1 by default
	end   int = -1 // end value of the group, -1 by default
}
91
// get_group_list returns one `Re_group` for every pair of slots in `re.groups`.
// A pair whose start is >= 0 and whose end is greater than the start is copied
// into the result, a pair with a start >= 0 but an empty range becomes `Re_group{}`.
pub fn (re &RE) get_group_list() []Re_group {
	mut res := []Re_group{len: re.groups.len >> 1}

	for gi := 0; gi < re.groups.len; gi += 2 {
		txt_st := re.groups[gi]
		if txt_st < 0 {
			// nothing stored for this pair, keep the entry created with the array
			continue
		}
		txt_en := re.groups[gi + 1]
		if txt_en > txt_st {
			res[gi >> 1] = Re_group{
				start: txt_st
				end: txt_en
			}
		} else {
			res[gi >> 1] = Re_group{}
		}
	}
	return res
}
119
120/******************************************************************************
121*
122* Matchers
123*
124******************************************************************************/
// match_string matches the pattern against `in_txt` and returns the `start, end`
// pair of the match. When the anchors check fails `no_match_found, 0` is returned,
// in any other case the values produced by `match_base` are returned, with the
// end limited to `in_txt.len`.
@[direct_array_access]
pub fn (re &RE) match_string(in_txt string) (int, int) {
	unsafe {
		start, raw_end := re.match_base(in_txt.str, in_txt.len + 1)
		// match_base receives one extra char, never report an end past the text
		end := if raw_end > in_txt.len { in_txt.len } else { raw_end }

		if start < 0 || end < start {
			return start, end
		}
		ok, _ := re.check_anchors(in_txt, start, end)
		if !ok {
			return no_match_found, 0
		}
		return start, end
	}
}
144
// matches_string returns false only when the start returned by `match_string`
// for `in_txt` is `no_match_found`, true otherwise.
pub fn (re &RE) matches_string(in_txt string) bool {
	start, _ := re.match_string(in_txt)
	if start == no_match_found {
		return false
	}
	return true
}
150
151/******************************************************************************
152*
153* Finders
154*
155******************************************************************************/
// check_anchors validates the match `start..end` found in `in_txt` against the
// `f_ms` and `f_me` flags of the regex.
// The first returned value tells if the match is accepted, the second one is used
// by the callers as `stop_scan`: when true they stop looking for further matches.
@[direct_array_access; inline]
fn (re &RE) check_anchors(in_txt string, start int, end int) (bool, bool) {
	anchored_start := (re.flag & f_ms) != 0
	anchored_end := (re.flag & f_me) != 0

	// `^` means start of the source string.
	if anchored_start && start > 0 {
		return false, true
	}
	// without `$`, or with a match that reaches the end of the text, nothing else to check
	if !anchored_end || end >= in_txt.len {
		return true, false
	}
	// `$` means end of the source string, or right before a newline.
	if in_txt[end] in new_line_list {
		return true, false
	}
	// When `^` is also present, scanning forward can never recover.
	return false, anchored_start
}
172
173/*
174// find internal implementation HERE for reference do not remove!!
175@[direct_array_access]
176fn (mut re RE) find_imp(in_txt string) (int,int) {
177 old_flag := re.flag
178 re.flag |= f_src // enable search mode
179
180 start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
181 //print("Find [${start},${end}] '${in_txt[start..end]}'")
182 if end > in_txt.len {
183 end = in_txt.len
184 }
185 re.flag = old_flag
186
187 if start >= 0 && end > start {
188 return start, end
189 }
190 return no_match_found, 0
191}
192*/
193
// find tries to find the first match in the input string.
// It returns the `start, end` indexes of the match inside `in_txt`,
// or `-1, -1` when no match is found.
// On success the values stored in `re.groups` are shifted so that they refer
// to `in_txt` too; slots that hold a negative value are left untouched.
@[direct_array_access]
pub fn (mut re RE) find(in_txt string) (int, int) {
	mut i := 0
	for i <= in_txt.len {
		mut s := -1
		mut e := -1
		unsafe {
			// match on the tail of the text that starts at i
			s, e = re.match_base(in_txt.str + i, in_txt.len - i + 1)
		}
		// match_base receives one extra char, limit the end to the text
		// like match_string and find_all_str do
		if e > in_txt.len - i {
			e = in_txt.len - i
		}
		if s < 0 || e < s {
			i++
			continue
		}

		abs_start := i + s
		abs_end := i + e
		ok, stop_scan := re.check_anchors(in_txt, abs_start, abs_end)
		if !ok {
			if stop_scan {
				break
			}
			i++
			continue
		}

		// the groups are relative to the tail, make them relative to in_txt;
		// negative slots must not be shifted or they could turn into
		// values that look like valid indexes
		for gi in 0 .. re.groups.len {
			if re.groups[gi] >= 0 {
				re.groups[gi] += i
			}
		}
		return abs_start, abs_end
	}
	return -1, -1
}
235
// find_from tries to find the first match in the input string, starting from the `start` index.
// It returns the `start, end` indexes of the match inside `in_txt`,
// or `-1, -1` when `start` is negative or no match is found.
// On success the values stored in `re.groups` are shifted so that they refer
// to `in_txt` too; slots that hold a negative value are left untouched.
@[direct_array_access]
pub fn (mut re RE) find_from(in_txt string, start int) (int, int) {
	if start < 0 {
		return -1, -1
	}
	// the flag is saved here and written back before every return below
	old_flag := re.flag

	mut i := start
	for i <= in_txt.len {
		mut s := -1
		mut e := -1
		unsafe {
			// view on the tail of the text that starts at i
			tmp_str := tos(in_txt.str + i, in_txt.len - i)
			s, e = re.match_string(tmp_str)
		}
		if s < 0 || e < s {
			i++
			continue
		}

		abs_start := i + s
		abs_end := i + e
		ok, stop_scan := re.check_anchors(in_txt, abs_start, abs_end)
		if !ok {
			if stop_scan {
				break
			}
			i++
			continue
		}

		re.flag = old_flag
		// the groups are relative to the tail, make them relative to in_txt;
		// negative slots must not be shifted or they could turn into
		// values that look like valid indexes
		for gi in 0 .. re.groups.len {
			if re.groups[gi] >= 0 {
				re.groups[gi] += i
			}
		}
		return abs_start, abs_end
	}
	re.flag = old_flag
	return -1, -1
}
285
// find_all finds all the non overlapping occurrences of the match pattern and
// returns a flat list with the start and end index of every match.
// The returned end indexes are never greater than `in_txt.len`.
//
// Usage:
// ```v
// blurb := 'foobar boo steelbar toolbox foot tooooot'
// mut re := regex.regex_opt('f|t[eo]+')?
// res := re.find_all(blurb) // [0, 3, 12, 15, 20, 23, 28, 31, 33, 39]
// ```
@[direct_array_access]
pub fn (mut re RE) find_all(in_txt string) []int {
	mut i := 0
	mut res := []int{}

	for i <= in_txt.len {
		mut s := -1
		mut raw_e := -1
		unsafe {
			// match on the tail of the text that starts at i
			s, raw_e = re.match_base(in_txt.str + i, in_txt.len + 1 - i)
		}
		// match_base receives one extra char, limit the reported end to the text
		// like match_string and find_all_str do; the unclamped value is still
		// the one used to advance the scan
		e := if raw_e > in_txt.len - i { in_txt.len - i } else { raw_e }
		if s < 0 || e < s {
			i++
			continue
		}

		abs_start := i + s
		abs_end := i + e
		ok, stop_scan := re.check_anchors(in_txt, abs_start, abs_end)
		if !ok {
			if stop_scan {
				break
			}
			i++
			continue
		}

		res << abs_start
		res << abs_end
		// restart after the match, an empty match advances by one char
		if raw_e > s {
			i += raw_e
		} else {
			i++
		}
	}
	return res
}
343
// split returns the sections of `in_txt` found around the matches of the regex,
// when there are no matches the result holds only `in_txt`.
//
// Usage:
// ```v
// blurb := 'foobar boo steelbar toolbox foot tooooot'
// mut re := regex.regex_opt('f|t[eo]+')?
// res := re.split(blurb) // ['bar boo s', 'lbar ', 'lbox ', 't ', 't']
// ```
pub fn (mut re RE) split(in_txt string) []string {
	pos := re.find_all(in_txt)
	if pos.len == 0 {
		return [in_txt]
	}

	mut sections := []string{cap: pos.len / 2 + 1}
	// pos is a flat list of start, end pairs: every section goes from
	// the end of the previous match to the start of the current one
	mut prev_end := 0
	for k := 0; k < pos.len; k += 2 {
		sections << in_txt[prev_end..pos[k]]
		prev_end = pos[k + 1]
	}
	// text after the last match
	sections << in_txt[prev_end..]
	return sections
}
370
// find_all_str finds all the non overlapping occurrences of the match pattern
// and returns the matched strings as a list.
@[direct_array_access]
pub fn (mut re RE) find_all_str(in_txt string) []string {
	mut res := []string{}
	mut i := 0

	for i <= in_txt.len {
		mut s := -1
		mut e := -1
		unsafe {
			// match on the tail of the text that starts at i
			s, e = re.match_base(in_txt.str + i, in_txt.len + 1 - i)
		}
		if s < 0 || e < s {
			i++
			continue
		}

		ok, stop_scan := re.check_anchors(in_txt, i + s, i + e)
		if !ok {
			if stop_scan {
				break
			}
			i++
			continue
		}

		unsafe {
			tail := tos(in_txt.str + i, in_txt.len - i)
			// the end can not go past the tail
			tail_end := if e > tail.len { tail.len } else { e }
			res << tail[s..tail_end]
		}
		// restart after the match, an empty match advances by one char
		if e > s {
			i += e
		} else {
			i++
		}
	}
	return res
}
423
424/******************************************************************************
425*
426* Replacers
427*
428******************************************************************************/
// replace_simple returns a string where the matches found by `find_all` are
// replaced with the `repl` string, `in_txt` is returned when there are no matches.
// The `repl` string is written as it is.
pub fn (mut re RE) replace_simple(in_txt string, repl string) string {
	pos := re.find_all(in_txt)
	if pos.len == 0 {
		return in_txt
	}

	// a builder avoids the creation of a new string for every match
	mut res := strings.new_builder(in_txt.len)
	// pos is a flat list of start, end pairs
	mut last_end := 0
	for i := 0; i < pos.len; i += 2 {
		// text between the previous match and this one, then the replacement
		res.write_string(in_txt[last_end..pos[i]])
		res.write_string(repl)
		last_end = pos[i + 1]
	}
	// text after the last match
	res.write_string(in_txt[last_end..])
	return res.str()
}
452
// FnReplace is the type of the callback used by `replace_by_fn` for a custom replace:
// re      regex that produced the match
// in_txt  source text
// start   index of the start of the match in in_txt
// end     index of the end of the match in in_txt
// the match is in in_txt[start..end], the returned string is written in its place
pub type FnReplace = fn (re RE, in_txt string, start int, end int) string
459
// replace_by_fn returns a string where every match is replaced with the string
// returned by the `repl_fn` callback. The scan stops at the first `find_from`
// result that is not a non empty match, the rest of the text is copied as it is.
pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
	mut res := strings.new_builder(in_txt.len)
	// end of the last replaced match, it is also the point where the next search starts
	mut cursor := 0

	for cursor < in_txt.len {
		s, e := re.find_from(in_txt, cursor)
		if s < 0 || e <= s {
			break
		}
		// text between the previous match and this one
		if cursor < s {
			res.write_string(in_txt[cursor..s])
		}
		res.write_string(repl_fn(re, in_txt, s, e))
		cursor = e
	}
	// text after the last match
	if cursor < in_txt.len {
		res.write_string(in_txt[cursor..])
	}
	return res.str()
}
500
// parsed_replace_string returns `repl` with every backslash followed by a digit
// replaced by the text that `get_group_by_id` returns for that digit on `in_txt`.
// Only the first char after the backslash is read as group id,
// any other backslash sequence is copied unchanged.
fn (re &RE) parsed_replace_string(in_txt string, repl string) string {
	parts := repl.split('\\')
	// the first part comes before any backslash
	mut res := parts[0]
	for part in parts[1..] {
		if part.len > 0 && part[0] >= `0` && part[0] <= `9` {
			group_id := int(part[0] - `0`)
			res += re.get_group_by_id(in_txt, group_id) + part[1..]
		} else {
			// not a group reference, put back the backslash removed by the split
			res += '\\' + part
		}
	}
	return res
}
520
// replace returns a string where every match is replaced with the `repl_str` string,
// the group references in `repl_str` are expanded by `parsed_replace_string`.
// The scan stops at the first `find_from` result that is not a non empty match,
// the rest of the text is copied as it is.
pub fn (mut re RE) replace(in_txt string, repl_str string) string {
	mut res := strings.new_builder(in_txt.len)
	// end of the last replaced match, it is also the point where the next search starts
	mut cursor := 0

	for cursor < in_txt.len {
		s, e := re.find_from(in_txt, cursor)
		if s < 0 || e <= s {
			break
		}
		// text between the previous match and this one
		if cursor < s {
			res.write_string(in_txt[cursor..s])
		}
		res.write_string(re.parsed_replace_string(in_txt, repl_str))
		cursor = e
	}
	// text after the last match
	if cursor < in_txt.len {
		res.write_string(in_txt[cursor..])
	}
	return res.str()
}
563
// replace_n returns a string where up to `count` matches are replaced with the `repl_str` string,
// if count is > 0 the matches are taken from the start of the string toward the end
// if count is < 0 the matches are taken from the end of the string toward the start
// if count is 0 `in_txt` is returned unchanged
// The `repl_str` string is written as it is.
pub fn (mut re RE) replace_n(in_txt string, repl_str string, count int) string {
	mut res := strings.new_builder(in_txt.len)
	mut lst := re.find_all(in_txt)

	if count == 0 {
		// no replace
		return in_txt
	}
	// lst is a flat list of start, end pairs: keep 2 * |count| values
	if count < 0 {
		lst = unsafe { lst#[count * 2..] } // last matches
	} else {
		lst = unsafe { lst#[..count * 2] } // first matches
	}

	mut prev_end := 0
	for idx := 0; idx < lst.len; idx += 2 {
		// text between the previous match and this one, then the replacement
		res.write_string(in_txt[prev_end..lst[idx]])
		res.write_string(repl_str)
		prev_end = lst[idx + 1]
	}
	// text after the last replaced match
	res.write_string(in_txt[prev_end..])

	return res.str()
}
597