v / vlib / strconv / atoi.v
366 lines · 331 sloc · 10.5 KB · 4b7955afe528e233a43f8586b923529e6d28c391
Raw
1module strconv
2
3// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
4// Use of this source code is governed by an MIT license
5// that can be found in the LICENSE file.
6// TODO: use options, or some way to return default with error.
// int_size is the width, in bits, of an int or uint value. The parsers in
// this file substitute it whenever a caller passes a bit size of 0.
// Earlier formulations, kept for reference:
//   int_size = 32 << (~u32(0) >> 63)
//   max_u64 = u64(u64(1 << 63) - 1)
const int_size = 32
11
// byte_to_lower returns `c` with bit 5 (value 32) set.
// For the ASCII letters `A`..`Z` this yields the matching lowercase letter;
// no other byte is special-cased, the same bit is simply OR-ed in.
@[inline]
pub fn byte_to_lower(c u8) u8 {
	lowercase_bit := u8(0x20)
	return c | lowercase_bit
}
16
// common_parse_uint wraps common_parse_uint2 and turns its status code into
// a V error.
// The status code is only reported when at least one of error_on_non_digit
// or error_on_high_digit is true (the two flags are currently treated the
// same); otherwise the value produced by common_parse_uint2 is returned as is.
pub fn common_parse_uint(s string, _base int, _bit_size int, error_on_non_digit bool, error_on_high_digit bool) !u64 {
	value, code := common_parse_uint2(s, _base, _bit_size)
	report_errors := error_on_non_digit || error_on_high_digit
	if code == 0 || !report_errors {
		return value
	}
	if code == -1 {
		return error('common_parse_uint: wrong base ${_base} for ${s}')
	}
	if code == -2 {
		return error('common_parse_uint: wrong bit size ${_bit_size} for ${s}')
	}
	if code == -3 {
		return error('common_parse_uint: integer overflow ${s}')
	}
	// any other code is the position (+1) of the first rejected character
	return error('common_parse_uint: syntax error ${s}')
}
32
// common_parse_uint2 parses `s` as an unsigned integer written in `_base`
// and checks that the value fits in `_bit_size` bits.
//
// When `_base` is 0 the base is taken from the prefix of `s`: `0b` = 2,
// `0o` = 8, `0x` = 16 (the prefix letter is matched case-insensitively),
// otherwise 10. A `_bit_size` of 0 means `int_size`.
// A single `_` is accepted between digits and directly after a base prefix.
//
// The first returned value contains the parsed value, the second one the
// status code:
//   0   = OK
//   > 0 = index of the first non-parseable character + 1
//   -1  = wrong base (after prefix detection the base must be in 2..36)
//   -2  = wrong bit size (below 0 or above 64)
//   -3  = overflow (the first value is then the maximum for the bit size)
@[direct_array_access]
pub fn common_parse_uint2(s string, _base int, _bit_size int) (u64, int) {
	if s == '' {
		return u64(0), 1
	}

	mut bit_size := _bit_size
	mut base := _base
	mut start_index := 0

	if base == 0 {
		// Look for octal, binary and hex prefix.
		base = 10
		if s[0] == `0` {
			// `| 32` folds an uppercase prefix letter to lowercase
			ch := if s.len > 1 { s[1] | 32 } else { `0` }
			if s.len >= 3 {
				if ch == `b` {
					base = 2
					start_index += 2
				} else if ch == `o` {
					base = 8
					start_index += 2
				} else if ch == `x` {
					base = 16
					start_index += 2
				}

				// check for underscore after the base prefix
				if s[start_index] == `_` {
					start_index++
					// A separator must be followed by a digit: without this
					// check a string such as `0x_` would skip the digit loop
					// entirely and be reported as a valid 0.
					if start_index >= s.len {
						return u64(0), start_index
					}
				}
			} else if s.len >= 2 && (s[1] >= `0` && s[1] <= `9`) {
				// manage leading zeros in decimal base's numbers
				// TODO: Check if this behaviour is logically right
				base = 10
				start_index++
			} else {
				// otherwise it is an octal for C compatibility
				base = 8
				start_index++
			}
		}
	}

	// Only bases 2..36 can be written with the digits 0-9 and the letters
	// a-z. Rejecting everything else here makes the documented -1 status
	// reachable and keeps the digit decoding below from accepting
	// punctuation as digits for bases above 36.
	if base < 2 || base > 36 {
		return u64(0), -1
	}

	if bit_size == 0 {
		bit_size = int_size
	} else if bit_size < 0 || bit_size > 64 {
		return u64(0), -2
	}
	// Cutoff is the smallest number such that cutoff*base > maxUint64.
	cutoff := max_u64 / u64(base) + u64(1)
	max_val := if bit_size == 64 { max_u64 } else { (u64(1) << u64(bit_size)) - u64(1) }
	basem1 := base - 1

	mut n := u64(0)
	for i in start_index .. s.len {
		mut c := s[i]

		// manage underscore inside the number
		if c == `_` {
			// an underscore may neither open nor close the digit sequence
			if i == start_index || i >= (s.len - 1) {
				return u64(0), 1
			}
			// two underscores in a row are rejected
			if s[i - 1] == `_` || s[i + 1] == `_` {
				return u64(0), 1
			}

			continue
		}

		// counts how many letter ranges (upper, lower) were entered
		mut sub_count := 0

		// get the 0-9 digit
		c -= 48 // subtract the rune `0`

		// check if we are in the superior base rune interval [A..Z]
		if c >= 17 { // (65 - 48)
			sub_count++
			c -= 7 // subtract the `A` - `0` rune to obtain the value of the digit

			// check if we are in the superior base rune interval [a..z]
			if c >= 42 { // (97 - 7 - 48)
				sub_count++
				c -= 32 // subtract the `a` - `0` rune to obtain the value of the digit
			}
		}

		// check for digit over base; `sub_count == 0 && c > 9` rejects the
		// characters that sit between `9` and `A`
		if c > basem1 || (sub_count == 0 && c > 9) {
			return n, i + 1
		}

		// check if we are in the cutoff zone
		if n >= cutoff {
			// n*base overflows
			return max_val, -3
		}
		n *= u64(base)
		n1 := n + u64(c)
		if n1 < n || n1 > max_val {
			// n+v overflows
			return max_val, -3
		}
		n = n1
	}
	return n, 0
}
148
// parse_uint is like parse_int but for unsigned numbers.
// It calls common_parse_uint with both error flags enabled, so every
// non-zero status of the underlying parse is returned as an error.
pub fn parse_uint(s string, _base int, _bit_size int) !u64 {
	value := common_parse_uint(s, _base, _bit_size, true, true)!
	return value
}
153
// common_parse_int is the shared implementation behind parse_int.
// It strips one optional leading `+` or `-`, parses the remainder with
// common_parse_uint (forwarding both error flags), then checks the magnitude
// against the signed range of `_bit_size` bits (0 means `int_size`).
// An empty string yields 0 without an error.
// When the magnitude is out of the signed range, an error is returned if
// error_on_high_digit is set; otherwise the nearest limit is returned.
@[direct_array_access]
pub fn common_parse_int(_s string, base int, _bit_size int, error_on_non_digit bool, error_on_high_digit bool) !i64 {
	if _s == '' {
		return i64(0)
	}
	bit_size := if _bit_size == 0 { int_size } else { _bit_size }

	// Pick off the leading sign, if any.
	first := _s[0]
	neg := first == `-`
	mut digits := _s
	if neg || first == `+` {
		// same as _s[1..], without going through a slice operation
		unsafe {
			digits = tos(_s.str + 1, _s.len - 1)
		}
	}

	// Convert the unsigned part, then check the signed range.
	un := common_parse_uint(digits, base, bit_size, error_on_non_digit, error_on_high_digit)!
	if un == 0 {
		return i64(0)
	}
	// TODO: check should u64(bit_size-1) be size of int (32)?
	cutoff := u64(1) << u64(bit_size - 1)
	// negative values may reach cutoff itself, positive ones only cutoff - 1
	out_of_range := if neg { un > cutoff } else { un >= cutoff }
	if out_of_range {
		if error_on_high_digit {
			return error('common_parse_int: integer overflow ${_s}')
		}
		return if neg { -i64(cutoff) } else { i64(cutoff - u64(1)) }
	}
	return if neg { -i64(un) } else { i64(un) }
}
206
// parse_int interprets a string `_s` in the given base (0, 2 to 36) and
// bit size (0 to 64) and returns the corresponding value.
//
// If the base argument is 0, the true base is implied by the string's
// prefix: 2 for "0b", 8 for "0" or "0o", 16 for "0x", and 10 otherwise.
//
// The `_bit_size` argument specifies the integer type that the result must
// fit into. Bit sizes 0, 8, 16, 32, and 64 correspond to int, i8, i16, i32
// and i64. If `_bit_size` is below 0 or above 64, an error is returned.
//
// The work is done by common_parse_int, called with error_on_non_digit = true
// and error_on_high_digit = false.
pub fn parse_int(_s string, base int, _bit_size int) !i64 {
	parsed := common_parse_int(_s, base, _bit_size, true, false)!
	return parsed
}
222
// atoi_common_check performs the basic checks shared by the atoi family:
// the string must not be empty, an optional leading `+` or `-` must be
// followed by at least one more character, and the number may neither start
// nor end with an underscore.
// Returns +1 or -1 depending on the sign, together with the index of the
// first character after the sign, or an error.
@[direct_array_access]
fn atoi_common_check(s string) !(i64, int) {
	if s.len == 0 {
		return error('strconv.atoi: parsing "": empty string')
	}

	first := s[0]
	has_sign := first == `-` || first == `+`
	sign := if first == `-` { i64(-1) } else { i64(1) }
	digits_from := if has_sign { 1 } else { 0 }

	if digits_from >= s.len {
		return error('strconv.atoi: parsing "${s}": no number after sign')
	}

	last := s[s.len - 1]
	if s[digits_from] == `_` || last == `_` {
		return error('strconv.atoi: parsing "${s}": values cannot start or end with underscores')
	}
	return sign, digits_from
}
252
// atoi_common does the radix 10 parsing for the i8, i16 and i32 sized
// variants (i64 has its own implementation) and produces the same error
// messages for all of them.
// `s` is the string to parse, `type_min` / `type_max` are the inclusive
// bounds of the target type. Single underscores between digits are skipped.
@[direct_array_access]
fn atoi_common(s string, type_min i64, type_max i64) !i64 {
	sign, first_digit := atoi_common_check(s)!
	mut acc := i64(0)
	mut prev_was_underscore := false
	for pos in first_digit .. s.len {
		ch := s[pos]
		if ch == `_` {
			if prev_was_underscore {
				// two underscores in a row
				return error('strconv.atoi: parsing "${s}": consecutives underscores are not allowed')
			}
			prev_was_underscore = true
			continue
		}
		// u8 subtraction wraps, so anything outside `0`..`9` ends up above 9
		digit := ch - `0`
		if digit > 9 {
			return error('strconv.atoi: parsing "${s}": invalid radix 10 character')
		}
		prev_was_underscore = false
		// the sign is applied digit by digit, so the accumulator walks
		// directly towards type_min for negative input
		acc = (acc * 10) + (digit * sign)
		if sign == 1 && acc > type_max {
			return error('strconv.atoi: parsing "${s}": integer overflow')
		}
		if acc < type_min {
			return error('strconv.atoi: parsing "${s}": integer underflow')
		}
	}
	return acc
}
286
// atoi converts the radix 10 string `s` to an int.
// returns an int [-2147483648 .. 2147483647] or an error.
// It follows V scanner as much as observed.
pub fn atoi(s string) !int {
	parsed := atoi_common(s, i64_min_int32, i64_max_int32)!
	return int(parsed)
}
292
// atoi8 is equivalent to atoi(s), with the result range of type i8.
// returns an i8 [-128 .. 127] or an error.
pub fn atoi8(s string) !i8 {
	parsed := atoi_common(s, min_i8, max_i8)!
	return i8(parsed)
}
298
// atoi16 is equivalent to atoi(s), with the result range of type i16.
// returns an i16 [-32768 .. 32767] or an error.
pub fn atoi16(s string) !i16 {
	parsed := atoi_common(s, min_i16, max_i16)!
	return i16(parsed)
}
304
// atoi32 is equivalent to atoi(s), with the result range of type i32.
// returns an i32 [-2147483648 .. 2147483647] or an error.
pub fn atoi32(s string) !i32 {
	parsed := atoi_common(s, min_i32, max_i32)!
	return i32(parsed)
}
310
// atoi64 converts a radix 10 string to the i64 type.
// returns an i64 [-9223372036854775808 .. 9223372036854775807] or an error.
// Single underscores between digits are skipped. Range checking is delegated
// to safe_mul10_64bits and safe_add_64bits, whose error text is embedded in
// the returned error.
@[direct_array_access]
pub fn atoi64(s string) !i64 {
	sign, first_digit := atoi_common_check(s)!
	mut acc := i64(0)
	mut prev_was_underscore := false
	for pos in first_digit .. s.len {
		ch := s[pos]
		if ch == `_` {
			if prev_was_underscore {
				// two underscores in a row
				return error('strconv.atoi64: parsing "${s}": consecutives underscores are not allowed')
			}
			prev_was_underscore = true
			continue
		}
		// u8 subtraction wraps, so anything outside `0`..`9` ends up above 9
		digit := ch - `0`
		if digit > 9 {
			return error('strconv.atoi64: parsing "${s}": invalid radix 10 character')
		}
		prev_was_underscore = false
		scaled := safe_mul10_64bits(acc) or {
			return error('strconv.atoi64: parsing "${s}": ${err}')
		}
		// the signed digit is added, so negative input accumulates downwards
		acc = safe_add_64bits(scaled, int(digit * sign)) or {
			return error('strconv.atoi64: parsing "${s}": ${err}')
		}
	}
	return acc
}
339
// safe_add_64bits performs a signed 64 bits addition and returns an error
// instead of wrapping: 'integer overflow' when the sum would exceed max_i64,
// 'integer underflow' when it would fall below min_i64.
@[inline]
fn safe_add_64bits(a i64, b i64) !i64 {
	if a > 0 {
		// largest b that still fits above a
		headroom := max_i64 - a
		if b > headroom {
			return error('integer overflow')
		}
	} else if a < 0 {
		// smallest b that still fits below a
		legroom := min_i64 - a
		if b < legroom {
			return error('integer underflow')
		}
	}
	return a + b
}
351
// safe_mul10_64bits performs an `a * 10` multiplication and returns an error
// instead of wrapping: 'integer overflow' when the product would exceed
// max_i64, 'integer underflow' when it would fall below min_i64.
@[inline]
fn safe_mul10_64bits(a i64) !i64 {
	// largest / smallest operands whose tenfold still fits in an i64
	upper := max_i64 / 10
	lower := min_i64 / 10
	if a > upper {
		return error('integer overflow')
	}
	if a < lower {
		return error('integer underflow')
	}
	return a * 10
}
364
// Inclusive bounds of a 32-bit signed integer, held as i64 values; atoi
// passes them to atoi_common as its range limits.
// Both are written as an offset by one: msvc has a bug that treats just
// i64(min_int) as 2147483648 :-(; this is a workaround for it.
const i64_min_int32 = i64(-2147483647) - 1
const i64_max_int32 = i64(2147483646) + 1
367