v2 / vlib / strconv / atoi.v
371 lines · 336 sloc · 10.57 KB · 8986645ee93015bf9b44e7839c3a3370aff4f51b
Raw
1module strconv
2
3// Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
4// Use of this source code is governed by an MIT license
5// that can be found in the LICENSE file.
6// TODO: use options, or some way to return default with error.
7// int_size is the size in bits of an int or uint value.
8// int_size = 32 << (~u32(0) >> 63)
9// max_u64 = u64(u64(1 << 63) - 1)
// int_size is the bit size that common_parse_uint2 and common_parse_int
// substitute when they are called with a bit size of 0.
10const int_size = 32
11
// byte_to_lower sets bit 5 (value 0x20) of c, which maps the ASCII letters
// `A`..`Z` to `a`..`z`. No check is made that c is a letter, so any other
// byte that has this bit cleared is changed as well.
@[inline]
pub fn byte_to_lower(c u8) u8 {
	return c | 0x20
}
16
// common_parse_uint is called by parse_uint. It runs common_parse_uint2 and
// converts a non-zero status code into an error, but only when at least one
// of error_on_non_digit / error_on_high_digit is set. When both are false,
// the value produced by common_parse_uint2 is returned unchanged.
// NOTE: the two flags are currently interchangeable, either one enables
// every kind of error.
pub fn common_parse_uint(s string, _base int, _bit_size int, error_on_non_digit bool, error_on_high_digit bool) !u64 {
	value, status := common_parse_uint2(s, _base, _bit_size)
	report_errors := error_on_non_digit || error_on_high_digit
	if status == 0 || !report_errors {
		return value
	}
	if status == -1 {
		return error('common_parse_uint: wrong base ${_base} for ${s}')
	}
	if status == -2 {
		return error('common_parse_uint: wrong bit size ${_bit_size} for ${s}')
	}
	if status == -3 {
		return error('common_parse_uint: integer overflow ${s}')
	}
	// every other status is the position of an unparseable character
	return error('common_parse_uint: syntax error ${s}')
}
32
// common_parse_uint2 parses s as an unsigned integer written in base _base
// and checks that the value fits in _bit_size bits.
//
// _base must be 0 or lie in 2..36. With 0 the base is deduced from the
// prefix of s: `0b` -> 2, `0o` -> 8, `0x` -> 16, a lone `0` -> 8, anything
// else -> 10 (a `0` followed by decimal digits is read as decimal).
// _bit_size must lie in 0..64, where 0 selects int_size.
// A single `_` is accepted between two digits as a separator.
//
// The first returned value contains the parsed value, the second one a
// status code:
//   0  = OK
//   >0 = index of the first non-parseable character + 1 (an empty string and
//        a misplaced or doubled `_` report 1)
//   -1 = wrong base
//   -2 = wrong bit size
//   -3 = overflow (the value is then the maximum for the bit size)
@[direct_array_access]
pub fn common_parse_uint2(s string, _base int, _bit_size int) (u64, int) {
	if s == '' {
		return u64(0), 1
	}

	mut bit_size := _bit_size
	mut base := _base
	mut start_index := 0

	if base == 0 {
		// Look for octal, binary and hex prefix.
		base = 10
		if s[0] == `0` {
			ch := if s.len > 1 { s[1] | 32 } else { `0` }
			if s.len >= 3 {
				if ch == `b` {
					base = 2
					start_index += 2
				} else if ch == `o` {
					base = 8
					start_index += 2
				} else if ch == `x` {
					base = 16
					start_index += 2
				}

				// check for underscore after the base prefix
				if s[start_index] == `_` {
					start_index++
					// a prefix followed only by `_` holds no digit at all:
					// report it instead of silently returning 0
					if start_index >= s.len {
						return u64(0), start_index
					}
				}
			} else if s.len >= 2 && (s[1] >= `0` && s[1] <= `9`) {
				// manage leading zeros in decimal base's numbers
				// TODO: Check if this behaviour is logically right
				base = 10
				start_index++
			} else {
				// otherwise it is an octal for C compatibility
				base = 8
				start_index++
			}
		}
	} else if base < 2 || base > 36 {
		// The digit decoding below only yields values 0..35, and a base of 1
		// or less has no usable digits, so reject everything outside 2..36.
		return u64(0), -1
	}

	if bit_size == 0 {
		bit_size = int_size
	} else if bit_size < 0 || bit_size > 64 {
		return u64(0), -2
	}
	// Cutoff is the smallest number such that cutoff*base > maxUint64.
	cutoff := max_u64 / u64(base) + u64(1)
	max_val := if bit_size == 64 { max_u64 } else { (u64(1) << u64(bit_size)) - u64(1) }
	basem1 := base - 1

	mut n := u64(0)
	for i in start_index .. s.len {
		mut c := s[i]

		// manage underscore inside the number
		if c == `_` {
			// an underscore can neither open nor close the digits
			if i == start_index || i >= (s.len - 1) {
				return u64(0), 1
			}
			// two underscores in a row are not allowed
			if s[i - 1] == `_` || s[i + 1] == `_` {
				return u64(0), 1
			}

			continue
		}

		mut sub_count := 0

		// get the 0-9 digit
		c -= 48 // subtract the rune `0`

		// check if we are in the superior base rune interval [A..Z]
		if c >= 17 { // (65 - 48)
			sub_count++
			c -= 7 // subtract the `A` - `0` rune to obtain the value of the digit

			// check if we are in the superior base rune interval [a..z]
			if c >= 42 { // (97 - 7 - 48)
				sub_count++
				c -= 32 // subtract the `a` - `0` rune to obtain the value of the digit
			}
		}

		// check for digit over base; sub_count == 0 && c > 9 catches the
		// characters that sit between `9` and `A`
		if c > basem1 || (sub_count == 0 && c > 9) {
			return n, i + 1
		}

		// check if we are in the cutoff zone
		if n >= cutoff {
			// n*base overflows
			return max_val, -3
		}
		n *= u64(base)
		n1 := n + u64(c)
		if n1 < n || n1 > max_val {
			// n+v overflows
			return max_val, -3
		}
		n = n1
	}
	return n, 0
}
148
// parse_uint is like parse_int but for unsigned numbers.
// It forwards to common_parse_uint with both error flags enabled, so every
// non-zero status of the underlying parser is returned as an error.
@[markused]
pub fn parse_uint(s string, _base int, _bit_size int) !u64 {
	parsed := common_parse_uint(s, _base, _bit_size, true, true)!
	return parsed
}
154
// common_parse_int is called by parse_int. It removes an optional leading
// `+` or `-`, parses the remaining text with common_parse_uint and then
// checks the magnitude against the signed range of _bit_size bits
// (int_size bits when _bit_size is 0).
// An empty string yields 0 without an error.
// When the magnitude does not fit the signed range, an error is returned if
// error_on_high_digit is true; otherwise the nearest bound of the range is
// returned.
@[direct_array_access]
pub fn common_parse_int(_s string, base int, _bit_size int, error_on_non_digit bool, error_on_high_digit bool) !i64 {
	if _s.len == 0 {
		return i64(0)
	}
	bit_size := if _bit_size == 0 { int_size } else { _bit_size }

	// Pick off the leading sign, if any, without copying the string.
	is_negative := _s[0] == `-`
	mut digits := _s
	if is_negative || _s[0] == `+` {
		unsafe {
			digits = tos(_s.str + 1, _s.len - 1)
		}
	}

	// Convert unsigned and check range.
	magnitude := common_parse_uint(digits, base, bit_size, error_on_non_digit, error_on_high_digit)!
	if magnitude == 0 {
		return i64(0)
	}

	// limit is 2^(bit_size-1): the largest magnitude a negative value may
	// have, and one more than the largest positive value.
	// TODO: check should u64(bit_size-1) be size of int (32)?
	limit := u64(1) << u64(bit_size - 1)
	if is_negative {
		if magnitude > limit {
			if error_on_high_digit {
				return error('common_parse_int: integer overflow ${_s}')
			}
			return -i64(limit)
		}
		return -i64(magnitude)
	}
	if magnitude >= limit {
		if error_on_high_digit {
			return error('common_parse_int: integer overflow ${_s}')
		}
		return i64(limit - u64(1))
	}
	return i64(magnitude)
}
207
// parse_int interprets a string s in the given base (0, 2 to 36) and
// bit size (0 to 64) and returns the corresponding value i.
//
// If the base argument is 0, the true base is implied by the string's
// prefix: 2 for "0b", 8 for "0" or "0o", 16 for "0x", and 10 otherwise.
// Single underscore characters between digits are accepted as separators.
//
// The _bit_size argument specifies the integer type that the result must
// fit into. Bit sizes 0, 8, 16, 32, and 64 correspond to int, int8, int16,
// int32, and int64. If _bit_size is below 0 or above 64, an error is returned.
//
// parse_int calls common_parse_int with error_on_non_digit = true and
// error_on_high_digit = false: a value that was parsed successfully but lies
// outside the signed range of the bit size is clamped to the nearest bound
// of that range instead of producing an error.
pub fn parse_int(_s string, base int, _bit_size int) !i64 {
	parsed := common_parse_int(_s, base, _bit_size, true, false)!
	return parsed
}
223
// atoi_common_check performs the basic checks on a string to parse:
// it must not be empty, an optional leading `+` or `-` must be followed by
// at least one more character, and the text after the sign must neither
// start nor end with an underscore.
// It returns +1 or -1 depending on the sign together with the index of the
// first character after the sign, or an error.
@[direct_array_access]
fn atoi_common_check(s string) !(i64, int) {
	if s.len == 0 {
		return error('strconv.atoi: parsing "": empty string')
	}

	lead := s[0]
	has_sign := lead == `-` || lead == `+`
	first_digit := if has_sign { 1 } else { 0 }
	sign := if lead == `-` { i64(-1) } else { i64(1) }

	// nothing left once the sign has been consumed
	if first_digit >= s.len {
		return error('strconv.atoi: parsing "${s}": no number after sign')
	}

	if s[first_digit] == `_` || s[s.len - 1] == `_` {
		return error('strconv.atoi: parsing "${s}": values cannot start or end with underscores')
	}
	return sign, first_digit
}
253
// atoi_common does the parsing work for the i8, i16 and i32 sized variants
// (atoi64 has its own loop) and produces the same error messages for all of
// them.
// s is the radix 10 string to parse, type_min and type_max are the smallest
// and largest values allowed for the target type.
// The sign is applied to every digit as it is accumulated, so negative
// numbers are built downwards and compared against type_min.
@[direct_array_access]
fn atoi_common(s string, type_min i64, type_max i64) !i64 {
	sign, first_digit := atoi_common_check(s)!
	mut acc := i64(0)
	mut after_underscore := false
	for pos in first_digit .. s.len {
		digit := s[pos] - `0`
		if digit == 47 { // 47 = Ascii(`_`) - ascii(`0`) = 95 - 48.
			if after_underscore {
				// two underscores in a row
				return error('strconv.atoi: parsing "${s}": consecutives underscores are not allowed')
			}
			after_underscore = true
			continue // the underscore itself carries no value
		}
		if digit > 9 {
			return error('strconv.atoi: parsing "${s}": invalid radix 10 character')
		}
		after_underscore = false
		acc = (acc * 10) + (digit * sign)
		if sign == 1 && acc > type_max {
			return error('strconv.atoi: parsing "${s}": integer overflow')
		}
		if acc < type_min {
			return error('strconv.atoi: parsing "${s}": integer underflow')
		}
	}
	return acc
}
287
// atoi converts the radix 10 string s to an int.
// The parsing is done by atoi_common with the bounds i64_min_int32 and
// i64_max_int32; an error is returned when s is malformed or out of range.
// It follows V scanner as much as observed.
@[markused]
pub fn atoi(s string) !int {
	parsed := atoi_common(s, i64_min_int32, i64_max_int32)!
	return int(parsed)
}
294
// atoi8 is the i8 counterpart of atoi(s).
// returns an i8 [-128 .. 127] or an error.
@[markused]
pub fn atoi8(s string) !i8 {
	parsed := atoi_common(s, min_i8, max_i8)!
	return i8(parsed)
}
301
// atoi16 is the i16 counterpart of atoi(s).
// returns an i16 [-32768 .. 32767] or an error.
@[markused]
pub fn atoi16(s string) !i16 {
	parsed := atoi_common(s, min_i16, max_i16)!
	return i16(parsed)
}
308
// atoi32 is the i32 counterpart of atoi(s).
// returns an i32 [-2147483648 .. 2147483647] or an error.
@[markused]
pub fn atoi32(s string) !i32 {
	parsed := atoi_common(s, min_i32, max_i32)!
	return i32(parsed)
}
315
// atoi64 converts radix 10 string to i64 type.
// returns an i64 [-9223372036854775808 .. 9223372036854775807] or an error.
// The sign is applied to every digit as it is accumulated; each step goes
// through safe_mul10_64bits and safe_add_64bits, whose errors are wrapped
// with the string being parsed.
@[direct_array_access; markused]
pub fn atoi64(s string) !i64 {
	sign, first_digit := atoi_common_check(s)!
	mut acc := i64(0)
	mut after_underscore := false
	for pos in first_digit .. s.len {
		digit := s[pos] - `0`
		if digit == 47 { // 47 = Ascii(`_`) - ascii(`0`) = 95 - 48.
			if after_underscore {
				// two underscores in a row
				return error('strconv.atoi64: parsing "${s}": consecutives underscores are not allowed')
			}
			after_underscore = true
			continue // the underscore itself carries no value
		}
		if digit > 9 {
			return error('strconv.atoi64: parsing "${s}": invalid radix 10 character')
		}
		after_underscore = false
		acc = safe_mul10_64bits(acc) or { return error('strconv.atoi64: parsing "${s}": ${err}') }
		acc = safe_add_64bits(acc, int(digit * sign)) or {
			return error('strconv.atoi64: parsing "${s}": ${err}')
		}
	}
	return acc
}
344
// safe_add_64bits performs a signed 64 bits addition and returns an error
// in case of overflow or underflow.
// Only a positive a can overflow and only a negative a can underflow, so
// the bound that is checked depends on the sign of a.
@[inline]
fn safe_add_64bits(a i64, b i64) !i64 {
	if a > 0 {
		if b > (max_i64 - a) {
			return error('integer overflow')
		}
	} else if a < 0 {
		if b < (min_i64 - a) {
			return error('integer underflow')
		}
	}
	return a + b
}
356
// safe_mul10_64bits performs a * 10 multiplication and returns an error
// in case of overflow or underflow.
// max_i64 / 10 is positive and min_i64 / 10 is negative, so each comparison
// already implies the sign of a.
@[inline]
fn safe_mul10_64bits(a i64) !i64 {
	upper := max_i64 / 10
	lower := min_i64 / 10
	if a > upper {
		return error('integer overflow')
	}
	if a < lower {
		return error('integer underflow')
	}
	return a * 10
}
369
// i64_min_int32 and i64_max_int32 are the bounds of the 32 bit signed integer
// range, stored as i64; atoi passes them to atoi_common as type_min and
// type_max. Each one is written as a literal adjusted by 1 (see the msvc note).
370const i64_min_int32 = i64(-2147483647) - 1 // msvc has a bug that treats just i64(min_int) as 2147483648 :-(; this is a workaround for it
371const i64_max_int32 = i64(2147483646) + 1
372