From 73060f1b1d711370a8031929f16e5c8624585038 Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Wed, 25 Mar 2026 22:52:34 +0300 Subject: [PATCH] vet: warn for confusing regex (fixes #26011) --- cmd/tools/vvet/tests/confusing_regex.out | 2 + cmd/tools/vvet/tests/confusing_regex.vv | 7 + .../tests/no_warn_about_unambiguous_regex.out | 1 + .../tests/no_warn_about_unambiguous_regex.vv | 8 ++ cmd/tools/vvet/vvet.v | 136 ++++++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 cmd/tools/vvet/tests/confusing_regex.out create mode 100644 cmd/tools/vvet/tests/confusing_regex.vv create mode 100644 cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.out create mode 100644 cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.vv diff --git a/cmd/tools/vvet/tests/confusing_regex.out b/cmd/tools/vvet/tests/confusing_regex.out new file mode 100644 index 000000000..6ec6f0dd3 --- /dev/null +++ b/cmd/tools/vvet/tests/confusing_regex.out @@ -0,0 +1,2 @@ +cmd/tools/vvet/tests/confusing_regex.vv:4: warning: Confusing regex `|` in `foo|bar`: V regex applies `|` to adjacent tokens, not whole branches. Use `(foo)|(bar)` if you intended alternation. +cmd/tools/vvet/tests/confusing_regex.vv:6: warning: Confusing regex `|` in `zip|zap`: V regex applies `|` to adjacent tokens, not whole branches. Use `(zip)|(zap)` if you intended alternation. diff --git a/cmd/tools/vvet/tests/confusing_regex.vv b/cmd/tools/vvet/tests/confusing_regex.vv new file mode 100644 index 000000000..6ca85e95a --- /dev/null +++ b/cmd/tools/vvet/tests/confusing_regex.vv @@ -0,0 +1,7 @@ +import regex + +fn main() { + _ := regex.regex_opt(r'foo|bar') or { panic(err) } + mut re := regex.new() + re.compile_opt(r'zip|zap') or { panic(err) } +} diff --git a/cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.out b/cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.out new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.out @@ -0,0 +1 @@ + diff --git a/cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.vv b/cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.vv new file mode 100644 index 000000000..b244259b9 --- /dev/null +++ b/cmd/tools/vvet/tests/no_warn_about_unambiguous_regex.vv @@ -0,0 +1,8 @@ +import regex + +fn main() { + _ := regex.regex_opt(r'a|b') or { panic(err) } + _ := regex.regex_opt(r'(foo)|(bar)') or { panic(err) } + _ := regex.regex_opt(r'[a|b]') or { panic(err) } + _ := regex.regex_opt(r'foo\\|bar') or { panic(err) } +} diff --git a/cmd/tools/vvet/vvet.v b/cmd/tools/vvet/vvet.v index d394d813d..0b1160601 100644 --- a/cmd/tools/vvet/vvet.v +++ b/cmd/tools/vvet/vvet.v @@ -19,8 +19,10 @@ mut: warns shared []VetError notices shared []VetError file string + mod string filtered_lines FilteredLines analyze VetAnalyze + regex_vars map[string]bool } struct Options { @@ -123,6 +125,8 @@ fn (mut vt Vet) vet_file(path string) { mut table := ast.new_table() vt.vprintln("vetting file '${path}'...") file := parser.parse_file(path, mut table, .parse_comments, prefs) + vt.mod = file.mod.name + vt.regex_vars = map[string]bool{} vt.stmts(file.stmts) source_lines := os.read_lines(vt.file) or { []string{} } for ln, line in source_lines { @@ -289,10 +293,13 @@ fn (mut vt Vet) stmt(stmt ast.Stmt) { ast.AssignStmt { vt.exprs(stmt.left) vt.exprs(stmt.right) + vt.track_regex_assign(stmt) vt.analyze.stmt(&vt, stmt) } ast.FnDecl { old_fn_decl := vt.analyze.cur_fn + old_regex_vars := vt.regex_vars.clone() + vt.regex_vars = map[string]bool{} vt.analyze.cur_fn = stmt vt.stmts(stmt.stmts) if vt.opt.fn_sizing { @@ -302,6 +309,7 @@ fn (mut vt Vet) stmt(stmt ast.Stmt) { vt.analyze.potential_non_inlined(mut vt, stmt) } vt.analyze.cur_fn = old_fn_decl + vt.regex_vars = old_regex_vars.clone() } ast.StructDecl { vt.exprs(stmt.fields.map(it.default_expr)) @@ -349,6 +357,7 @@ fn (mut vt Vet) expr(expr ast.Expr) { ast.CallExpr { vt.expr(expr.left) vt.exprs(expr.args.map(it.expr)) + vt.vet_confusing_regex(expr) vt.analyze.expr(&vt, expr) } ast.MatchExpr { @@ -401,6 +410,133 @@ fn (mut vt Vet) const_decl(stmt ast.ConstDecl) { } } +fn (mut vt Vet) track_regex_assign(stmt ast.AssignStmt) { + for i, left in stmt.left { + if i >= stmt.right.len { + break + } + mut ident_name := '' + match left { + ast.Ident { + ident_name = left.name + } + else { + continue + } + } + if vt.is_regex_value_expr(stmt.right[i]) { + vt.regex_vars[ident_name] = true + } else { + vt.regex_vars.delete(ident_name) + } + } +} + +fn (mut vt Vet) vet_confusing_regex(expr ast.CallExpr) { + if expr.args.len == 0 || !vt.is_regex_pattern_call(expr) { + return + } + pattern_expr := expr.args[0].expr + pattern := if pattern_expr is ast.StringLiteral { pattern_expr } else { return } + snippet, suggestion := confusing_regex_branch(pattern.val) or { return } + vt.warn('Confusing regex `|` in `${snippet}`: V regex applies `|` to adjacent tokens, not whole branches. Use `${suggestion}` if you intended alternation.', + pattern.pos.line_nr, .unknown) +} + +fn (vt &Vet) is_regex_pattern_call(expr ast.CallExpr) bool { + short_name := expr.name.all_after_last('.') + if short_name in ['regex_opt', 'regex_base'] { + return vt.is_regex_fn_call(expr) + } + if expr.name == 'compile_opt' && expr.is_method { + return vt.is_regex_value_expr(expr.left) + } + return false +} + +fn (vt &Vet) is_regex_value_expr(expr ast.Expr) bool { + match expr { + ast.CallExpr { + short_name := expr.name.all_after_last('.') + if short_name == 'new' { + return vt.is_regex_fn_call(expr) + } + if short_name == 'regex_opt' { + return vt.is_regex_fn_call(expr) + } + return false + } + ast.Ident { + return expr.name in vt.regex_vars + } + else { + return false + } + } +} + +fn (vt &Vet) is_regex_fn_call(expr ast.CallExpr) bool { + if expr.name in ['regex.regex_opt', 'regex.regex_base', 'regex.new'] { + return true + } + if vt.mod == 'regex' && expr.name in ['regex_opt', 'regex_base', 'new'] { + return true + } + return false +} + +fn confusing_regex_branch(pattern string) ?(string, string) { + mut escaped := false + mut in_char_class := false + for i := 0; i < pattern.len; i++ { + ch := pattern[i] + if escaped { + escaped = false + continue + } + if ch == `\\` { + escaped = true + continue + } + if in_char_class { + if ch == `]` { + in_char_class = false + } + continue + } + if ch == `[` { + in_char_class = true + continue + } + if ch != `|` || i == 0 || i + 1 >= pattern.len { + continue + } + if !is_regex_plain_letter(pattern[i - 1]) || !is_regex_plain_letter(pattern[i + 1]) { + continue + } + if !((i >= 2 && is_regex_plain_letter(pattern[i - 2])) + || (i + 2 < pattern.len && is_regex_plain_letter(pattern[i + 2]))) { + continue + } + mut left_start := i - 1 + for left_start > 0 && is_regex_plain_letter(pattern[left_start - 1]) { + left_start-- + } + mut right_end := i + 1 + for right_end + 1 < pattern.len && is_regex_plain_letter(pattern[right_end + 1]) { + right_end++ + } + left := pattern[left_start..i] + right := pattern[i + 1..right_end + 1] + return pattern[left_start..right_end + 1], '(${left})|(${right})' + } + return none +} + +fn is_regex_plain_letter(ch u8) bool { + return ch.is_letter() +} + fn (mut vt Vet) vet_empty_str(expr ast.InfixExpr) { if expr.left is ast.SelectorExpr && expr.right is ast.IntegerLiteral { operand := (expr.left as ast.SelectorExpr) // TODO: remove as-casts when multiple conds can be smart-casted. -- 2.39.5