v2 / vlib / v / checker / str.v
362 lines · 346 sloc · 11.63 KB · a7039193c3425333ef1f754a2780111733af8167
Raw
1// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
2// Use of this source code is governed by an MIT license
3// that can be found in the LICENSE file.
4module checker
5
6import v.ast
7import v.token
8import encoding.utf8.validate
9import v.util
10
// get_default_fmt returns the format specifier used for an interpolated value
// when the source gave none.
// `ftyp` is the type of the interpolated expression, `typ` is the type the
// caller wants the numeric/pointer classification to be based on.
// Returns `_` when no default format is known for the type.
fn (mut c Checker) get_default_fmt(ftyp ast.Type, typ ast.Type) u8 {
	if ftyp.has_option_or_result() {
		return `s`
	}
	if typ.is_float() {
		return `g`
	}
	if typ.is_signed() || typ.is_int_literal() {
		return `d`
	}
	if typ.is_unsigned() {
		return `u`
	}
	if typ.is_pointer() {
		return `p`
	}
	mut sym := c.table.sym(c.unwrap_generic(ftyp))
	if sym.kind == .alias {
		// string aliases should be printable
		info := sym.info as ast.Alias
		// from here on, the kind/method checks look at the alias' parent symbol
		sym = c.table.sym(info.parent_type)
		if info.parent_type == ast.string_type {
			return `s`
		}
	}
	// Option/result types were already handled by the first check above, so
	// they do not need to be tested again here.
	if ftyp in [ast.string_type, ast.bool_type]
		|| sym.kind in [.function, .enum, .array, .array_fixed, .struct, .generic_inst, .map, .multi_return, .sum_type, .interface, .aggregate, .none]
		|| sym.has_method('str') {
		return `s`
	}
	return `_`
}
44
// get_string_inter_default_fmt picks the default format specifier for an
// interpolated expression.
// An identifier that refers to a non-argument variable holding a pointer to a
// `string` or `bool` (after resolving to the final type, enums excluded) gets
// `p`; everything else is delegated to `get_default_fmt`.
fn (mut c Checker) get_string_inter_default_fmt(expr ast.Expr, ftyp ast.Type, typ ast.Type) u8 {
	if expr is ast.Ident {
		if expr.obj is ast.Var {
			local := expr.obj
			is_local_ptr := local.typ.is_ptr() && !local.is_arg
			if is_local_ptr {
				target := local.typ.deref()
				if c.table.final_sym(target).kind != .enum
					&& c.table.final_type(target) in [ast.string_type, ast.bool_type] {
					return `p`
				}
			}
		}
	}
	return c.get_default_fmt(ftyp, typ)
}
62
// check_string_inter_lit_format_expr checks a dynamic width/precision
// expression of a string interpolation. Empty expressions are skipped.
// `what` names the checked part (it is used as the prefix of the error
// message). An error is reported unless the expression's type, after
// unaliasing numeric aliases, is `int` or an integer literal.
fn (mut c Checker) check_string_inter_lit_format_expr(mut expr ast.Expr, what string) {
	if expr is ast.EmptyExpr {
		return
	}
	// check the expression with `int` as the expected type, then restore
	saved_expected := c.expected_type
	c.expected_type = ast.int_type
	raw_typ := c.expr(mut expr)
	c.expected_type = saved_expected
	checked_typ := c.check_expr_option_or_result_call(expr, raw_typ)
	resolved_typ := c.type_resolver.get_type_or_default(expr, checked_typ)
	typ := c.table.unalias_num_type(resolved_typ)
	if typ == ast.int_type || typ.is_int_literal() {
		return
	}
	c.error('${what} expression should return `int`', expr.pos())
}
77
// string_inter_lit checks a string interpolation literal and returns
// `ast.string_type` (or `ast.void_type` when an interpolated expression has
// no resolvable type).
// For every interpolated expression it:
// - checks the expression and records its type in `node.expr_types`,
// - checks the optional dynamic width/precision expressions,
// - fills in a default format specifier when none was given, or validates the
//   given one against the expression type,
// - reports a recursive `str()` call when the current method is `str` and the
//   expression is its receiver.
// `c.inside_interface_deref` is set for the duration of the check and restored
// on every exit path, including the early returns inside the loop.
fn (mut c Checker) string_inter_lit(mut node ast.StringInterLiteral) ast.Type {
	inside_interface_deref_save := c.inside_interface_deref
	c.inside_interface_deref = true
	for i, mut expr in node.exprs {
		expected_type := c.expected_type
		c.expected_type = ast.string_type
		mut ftyp := c.expr(mut expr)
		c.expected_type = expected_type
		ftyp = c.type_resolver.get_type_or_default(expr,
			c.check_expr_option_or_result_call(expr, ftyp))
		if ftyp == ast.void_type || ftyp == 0 {
			c.error('expression does not return a value', expr.pos())
		} else if ftyp == ast.char_type && ftyp.nr_muls() == 0 {
			c.error('expression returning type `char` cannot be used in string interpolation directly, print its address or cast it to an integer instead',
				expr.pos())
		} else if c.fail_if_private_implicit_str(ftyp, expr.pos(), 'interpolate') {
			// do not leave the flag set when bailing out early
			c.inside_interface_deref = inside_interface_deref_save
			return ast.string_type
		}
		if ftyp == 0 {
			// do not leave the flag set when bailing out early
			c.inside_interface_deref = inside_interface_deref_save
			return ast.void_type
		}
		c.markused_string_inter_lit(mut node, ftyp)
		c.fail_if_unreadable(expr, ftyp, 'interpolation object')
		node.expr_types << ftyp
		if i < node.fwidth_exprs.len {
			mut width_expr := node.fwidth_exprs[i]
			c.check_string_inter_lit_format_expr(mut width_expr, 'width')
			node.fwidth_exprs[i] = width_expr
		}
		if i < node.precision_exprs.len {
			mut precision_expr := node.precision_exprs[i]
			c.check_string_inter_lit_format_expr(mut precision_expr, 'precision')
			node.precision_exprs[i] = precision_expr
		}
		// aliases without their own `str` method are classified by their
		// unaliased numeric type
		ftyp_sym := c.table.sym(ftyp)
		typ := if ftyp_sym.kind == .alias && !ftyp_sym.has_method('str') {
			c.table.unalias_num_type(ftyp)
		} else {
			ftyp
		}
		mut fmt := node.fmts[i]
		// During generic recheck, reset auto-determined format specifiers
		// since the type may have changed between instantiations
		if c.table.cur_concrete_types.len > 0 && !node.need_fmts[i] && fmt != `_` {
			fmt = `_`
		}
		// analyze and validate format specifier
		if fmt !in [`E`, `F`, `G`, `e`, `f`, `g`, `d`, `u`, `x`, `X`, `o`, `c`, `s`, `S`, `p`,
			`b`, `_`, `r`, `R`] {
			c.error('unknown format specifier `${fmt:c}`', node.fmt_poss[i])
		}
		if fmt == `_` { // set default representation for type if none has been given
			fmt = c.get_string_inter_default_fmt(expr, ftyp, typ)
			if fmt == `_` {
				if typ != ast.void_type && !(typ.has_flag(.generic) && (c.inside_lambda
					|| c.table.cur_concrete_types.len > 0
					|| (c.table.cur_fn != unsafe { nil } && c.table.cur_fn.generic_names.len > 0))) {
					c.error('no known default format for type `${c.table.get_type_name(ftyp)}`',
						node.fmt_poss[i])
				}
			} else if c.comptime.is_comptime(expr)
				&& c.type_resolver.get_type_or_default(expr, ast.void_type) != ast.void_type {
				// still `_` placeholder for comptime variable without specifier
				node.need_fmts[i] = false
			} else {
				node.fmts[i] = fmt
				node.need_fmts[i] = false
			}
		} else { // check if given format specifier is valid for type
			has_dynamic_precision := i < node.precision_exprs.len
				&& node.precision_exprs[i] !is ast.EmptyExpr
			// NOTE(review): 987698 is presumably the "no precision given" sentinel
			// stored by the parser - confirm against the parser
			if (node.precisions[i] != 987698 || has_dynamic_precision) && !typ.is_float() {
				c.error('precision specification only valid for float types', node.fmt_poss[i])
			}
			if node.pluss[i] && !typ.is_number() {
				c.error('plus prefix only allowed for numbers', node.fmt_poss[i])
			}
			if ((typ.is_unsigned() && fmt !in [`u`, `x`, `X`, `o`, `c`, `b`])
				|| (typ.is_signed() && fmt !in [`d`, `x`, `X`, `o`, `c`, `b`])
				|| (typ.is_int_literal() && fmt !in [`d`, `c`, `x`, `X`, `o`, `u`, `b`])
				|| (typ.is_float() && fmt !in [`E`, `F`, `G`, `e`, `f`, `g`])
				|| (typ.is_pointer() && fmt !in [`p`, `x`, `X`])
				|| (typ.is_string() && fmt !in [`s`, `S`, `r`, `R`])
				|| (typ.idx() in [ast.i64_type_idx, ast.f64_type_idx] && fmt == `c`))
				&& !(typ.is_ptr() && fmt in [`p`, `x`, `X`]) {
				c.error('illegal format specifier `${fmt:c}` for type `${c.table.get_type_name(ftyp)}`',
					node.fmt_poss[i])
			}
			if c.table.final_sym(typ).kind in [.array, .array_fixed, .struct, .interface, .none, .map, .sum_type]
				&& fmt in [`E`, `F`, `G`, `e`, `f`, `g`, `d`, `u`, `x`, `X`, `o`, `c`, `p`, `b`, `r`, `R`]
				&& !(typ.is_ptr() && fmt in [`p`, `x`, `X`]) {
				c.error('illegal format specifier `${fmt:c}` for type `${c.table.get_type_name(ftyp)}`',
					node.fmt_poss[i])
			}
			// an explicit specifier is needed when it differs from the default,
			// and always for `g`/`G` on floats
			node.need_fmts[i] = fmt != c.get_default_fmt(ftyp, typ)
				|| (typ.is_float() && fmt in [`g`, `G`])
		}
		// check recursive str
		if c.table.cur_fn != unsafe { nil } && c.table.cur_fn.is_method
			&& c.table.cur_fn.name == 'str' && c.table.cur_fn.receiver.name == '${expr}' {
			c.error('cannot call `str()` method recursively', expr.pos())
		}
	}
	c.inside_interface_deref = inside_interface_deref_save
	if c.pref.warn_about_allocs {
		c.warn_alloc('string interpolation', node.pos)
	}
	return ast.string_type
}
187
// unicode_lit_overflow_message is the error text reported by `string_lit` when
// a `\u` escape has more than six hex digits or a six digit value above 0x10ffff.
const unicode_lit_overflow_message = 'unicode character exceeds max allowed value of 0x10ffff, consider using a unicode literal (\\u####)'
189
// is_source_char_escaped reports whether the byte at `idx` in `source` is
// preceded by an odd number of consecutive backslashes.
fn is_source_char_escaped(source string, idx int) bool {
	mut run_len := 0
	// walk left over the uninterrupted run of backslashes before `idx`
	for pos := idx - 1; pos >= 0 && source[pos] == `\\`; pos-- {
		run_len++
	}
	return run_len % 2 == 1
}
199
// raw_string_literal_source returns the quoted source text (quotes included)
// of the string literal that covers `approx_pos` in `source`, or `none` when
// no such literal is found.
// `approx_pos` is clamped into `source`. The search walks backwards from it
// looking for an unescaped opening quote, then forwards for the matching
// closing quote; a candidate is accepted only if it closes at or after the
// clamped position. A quote directly preceded by `r` is treated as opening a
// raw string, where backslashes do not escape the closing quote.
fn raw_string_literal_source(source string, approx_pos int) ?string {
	if source.len == 0 {
		return none
	}
	last_idx := source.len - 1
	hint := if approx_pos < 0 {
		0
	} else if approx_pos > last_idx {
		last_idx
	} else {
		approx_pos
	}
	for open_idx := hint; open_idx >= 0; open_idx-- {
		quote := source[open_idx]
		if quote !in [`'`, `"`] || is_source_char_escaped(source, open_idx) {
			continue
		}
		is_raw := open_idx > 0 && source[open_idx - 1] == `r`
		for close_idx := open_idx + 1; close_idx < source.len; close_idx++ {
			if source[close_idx] != quote {
				continue
			}
			if !is_raw && is_source_char_escaped(source, close_idx) {
				continue
			}
			if close_idx >= hint {
				return source[open_idx..close_idx + 1]
			}
			// this literal ends before the hint; keep looking further left
			break
		}
	}
	return none
}
230
// source_string_literal_is_valid_utf8 reports whether the string literal is
// valid UTF-8. It validates the literal's text as found in its source file;
// when the file index is out of range, the file can not be read, or the
// literal can not be located in it, it validates `node.val` instead.
fn (c &Checker) source_string_literal_is_valid_utf8(node ast.StringLiteral) bool {
	file_idx := node.pos.file_idx
	if file_idx >= 0 && file_idx < c.table.filelist.len {
		if source := util.read_file(c.table.filelist[file_idx]) {
			if literal_source := raw_string_literal_source(source, node.pos.pos) {
				return validate.utf8_string(literal_source)
			}
		}
	}
	return validate.utf8_string(node.val)
}
243
// string_lit checks a string literal and always returns `ast.string_type`.
// It emits a note when the literal's source bytes are not valid UTF-8, and an
// error for every `\u` escape whose value exceeds 0x10ffff, the maximum
// allowed unicode value:
// https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff
@[direct_array_access]
fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
	// Validate the bytes that came from the source file, not the decoded string value.
	// `\x..` escapes are allowed to produce arbitrary bytes intentionally.
	if !c.source_string_literal_is_valid_utf8(node) {
		c.note('invalid utf8 byte sequence in string literal', node.pos)
	}
	mut idx := 0
	for idx < node.val.len {
		if node.val[idx] != `\\` {
			idx++
			continue
		}
		escape_idx := idx
		idx++
		escaped := node.val[idx] or { return ast.string_type }
		if escaped == `\\` {
			// ignore escaping char
			idx++
			continue
		}
		if escaped != `u` {
			continue
		}
		idx++
		mut digit := node.val[idx] or { return ast.string_type }
		mut digits_seen := 0
		for digit.is_hex_digit() {
			digits_seen++
			// up to five hex digits can never exceed 0x10ffff
			if digits_seen >= 6 {
				overflow_pos := token.Pos{
					...node.pos
					col: u16(node.pos.col + 1 + escape_idx)
					len: idx + 1 - escape_idx
				}
				if digits_seen > 6 {
					c.error(unicode_lit_overflow_message, overflow_pos)
				} else {
					// with exactly six digits only the two leading ones decide;
					// subtracting `0` (48) maps hex letters to values above 1
					first_digit := node.val[idx - 5] - 48
					second_digit := node.val[idx - 4] - 48
					if first_digit > 1 || (first_digit == 1 && second_digit > 0) {
						c.error(unicode_lit_overflow_message, overflow_pos)
					}
				}
			}
			idx++
			digit = node.val[idx] or { return ast.string_type }
		}
	}
	return ast.string_type
}
306
// LoHiLimit holds the two limits, as digit strings, that the digits of an
// integer literal are compared against in `check_num_literal`.
struct LoHiLimit {
	lower  string // limit used when the literal is negative
	higher string // limit used when the literal is not negative
}
311
// iencoding_map maps the (upper-cased) radix prefix letter of an integer
// literal to its limits, written in that radix without the prefix.
// The `_` entry holds the limits used for literals without a radix prefix.
const iencoding_map = {
	`B`: LoHiLimit{
		lower:  '1000000000000000000000000000000000000000000000000000000000000000'
		higher: '1111111111111111111111111111111111111111111111111111111111111111'
	}
	`O`: LoHiLimit{
		lower:  '1000000000000000000000'
		higher: '1777777777777777777777'
	}
	`_`: LoHiLimit{
		lower:  '9223372036854775808'
		higher: '18446744073709551615'
	}
	`X`: LoHiLimit{
		lower:  '8000000000000000'
		higher: 'FFFFFFFFFFFFFFFF'
	}
}
318
// int_lit checks an integer literal for overflow and always returns
// `ast.int_literal_type`.
// Literals shorter than 17 characters are accepted without further checks.
// Longer ones are stripped of `_` separators and the sign, upper-cased, and
// compared against the limits of their radix (`0B`, `0O`, `0X`, or no prefix).
fn (mut c Checker) int_lit(mut node ast.IntegerLiteral) ast.Type {
	// can not be a too large number below this length, so the more expensive
	// checks are only done for long literals
	if node.val.len >= 17 {
		is_neg := node.val.starts_with('-')
		digits := node.val.replace('_', '').all_after('-').to_upper_ascii()
		has_radix_prefix := digits.len > 2 && digits[0] == `0` && digits[1] in [`B`, `X`, `O`]
		if has_radix_prefix {
			if limits := iencoding_map[digits[1]] {
				c.check_num_literal(limits, is_neg, digits[2..]) or {
					c.num_lit_overflow_error(node)
				}
			}
		} else {
			limits := iencoding_map[`_`]
			c.check_num_literal(limits, is_neg, digits) or { c.num_lit_overflow_error(node) }
		}
	}
	return ast.int_literal_type
}
336
// check_num_literal returns an error when the digits in `lit` denote a value
// larger than the applicable limit in `lohi`.
// `lohi.lower` is the limit for negative literals (`is_neg`), `lohi.higher`
// the one for all others. `lit` holds the digits only (no sign, radix prefix
// or `_` separators) and must use the same radix and letter case as the limit.
@[direct_array_access]
fn (mut c Checker) check_num_literal(lohi LoHiLimit, is_neg bool, lit string) ! {
	limit := if is_neg { lohi.lower } else { lohi.higher }
	// Leading zeros do not change the value, so they must not count towards
	// the length comparison (e.g. `0x00000000000000000001` is just 1).
	digits := lit.trim_left('0')
	if digits.len < limit.len {
		return
	}
	if digits.len > limit.len {
		return error('length overflow')
	}
	// same length: the first differing digit decides
	for i, digit in digits {
		if digit > limit[i] {
			return error('value overflow at i: ${i}')
		} else if digit < limit[i] {
			break
		}
	}
}
356
// num_lit_overflow_error reports that the integer literal overflows, unless
// the checker is currently inside an integer literal cast.
fn (mut c Checker) num_lit_overflow_error(node &ast.IntegerLiteral) {
	if !c.inside_integer_literal_cast {
		c.error('integer literal ${node.val} overflows int', node.pos)
	}
}
363