v2 / vlib / encoding / csv / csv_reader_random_access.v
580 lines · 519 sloc · 15.46 KB · 089778e55ee41dc89ce95eac9337740d35383acb
Raw
1/*
2csv random access reader 1.0 alpha
3
4Copyright (c) 2023 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7
8Known limitations:
9- no stream reading
10*/
11module csv
12
13import os
14
15/******************************************************************************
16*
17* Consts
18*
19******************************************************************************/
// endline lengths, in bytes: they are the values expected by the `end_line_len` fields
pub const endline_cr_len = 1 // one byte end of line: `\n`
pub const endline_crlf_len = 2 // two bytes end of line: `\r\n`

// Type of read buffer, it is the value stored in `RandomAccessReader.mem_buf_type`
pub const ram_csv = 1 // the csv data is in a RAM buffer
pub const file_csv = 0 // the csv data is read from a file
27
28/******************************************************************************
29*
30* Structs
31*
32******************************************************************************/
// ColumType is the data type of a csv column.
// It is stored in `HeaderItem.htype` and it is used by `get_cellt`
// to choose the conversion applied to the cell.
pub enum ColumType {
	string = 0 // the cell is returned as a string, without conversions
	int = 1 // the cell is converted to an int
	f32 = 2 // the cell is converted to a f32
}
38
// HeaderItem describes a single column of the csv header,
// the items are created by `build_header_dict`
pub struct HeaderItem {
pub mut:
	label string // text of the header cell
	column int // index of the column in the row
	htype ColumType = .string // type of the column, string is the default
}
45
// RandomAccessReader is a csv reader that allows the random access to the cells.
// The csv data can come from a file or from a RAM buffer, see `mem_buf_type`.
// Use `csv_reader` or `csv_reader_from_string` to create it.
@[heap]
pub struct RandomAccessReader {
pub mut:
	index i64 // position set by csv_reader at creation, moved after the BOM when it is present

	f os.File // source file, used only when the reader is file based
	f_len i64 // length in bytes of the source file
	is_bom_present bool // true if the source starts with the bytes EF BB BF (UTF-8 BOM)

	start_index i64 // start offset; with a file source fill_buffer overwrites it with the file offset of the data loaded in mem_buf
	end_index i64 = -1 // end offset of the data to read, csv_reader replaces -1 with the length of the source

	end_line u8 = `\n`
	end_line_len int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
	separator u8 = `,` // comma is the default separator
	separator_len int = 1 // size of the separator rune
	quote u8 = `"` // double quote is the standard quote char
	quote_remove bool // if true clear the cell from the quotes
	comment u8 = `#` // every line that starts with the comment char is ignored

	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell string = '#' // return this string if the cell is empty, csv_reader overwrites it with the configured value
	// ram buffer
	mem_buf_type u32 // buffer type 0=File,1=RAM
	mem_buf voidptr // buffer used to load chars from file, or the source buffer itself for a RAM reader
	mem_buf_size i64 // size of the buffer
	mem_buf_start i64 = -1 // start index in the file of the read buffer
	mem_buf_end i64 = -1 // end index in the file of the read buffer
	// csv map for quick access
	create_map_csv bool = true // flag to enable the csv map creation
	csv_map [][]i64 // for each row: offset of the row start, of each separator and of the row end
	// header
	header_row int = -1 // row index of the header in the csv_map
	header_list []HeaderItem // list of the header item
	header_map map[string]int // map from header label to column index
}
82
// RandomAccessReaderConfig holds the parameters accepted by `csv_reader`.
// The RAM buffer is used when `scr_buf` is not nil and `scr_buf_len` > 0,
// otherwise the file at `file_path` is used when the path is not empty.
@[params]
pub struct RandomAccessReaderConfig {
pub:
	scr_buf voidptr // pointer to the buffer of data
	scr_buf_len i64 // if > 0 use the RAM pointed from scr_buf as source of data
	file_path string // path of the csv file, used when no RAM buffer is passed
	start_index i64 // start offset in the source
	end_index i64 = -1 // end offset in the source, -1 means the length of the source
	mem_buf_size int = 1024 * 64 // default buffer size 64KByte
	separator u8 = `,`
	comment u8 = `#` // every line that starts with the comment char is ignored
	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell string // return this string if empty cell
	end_line_len int = endline_cr_len // size of the endline rune
	quote u8 = `"` // double quote is the standard quote char
	quote_remove bool // if true clear the cell from the quotes
	create_map_csv bool = true // if true make the map of the csv file
}
101
102/******************************************************************************
103*
104* Init, dispose, fill buffer
105*
106******************************************************************************/
107
// csv_reader_from_string creates a csv reader that uses the bytes of `in_str`
// as RAM source, all the other options keep their default values.
// The pointer to the string data is stored in the reader, no copy is done.
pub fn csv_reader_from_string(in_str string) !&RandomAccessReader {
	string_cfg := RandomAccessReaderConfig{
		scr_buf:     in_str.str
		scr_buf_len: in_str.len
	}
	return csv_reader(string_cfg)!
}
112
// csv_reader creates a random access csv reader.
// The source of the data is chosen from `cfg`:
// - if `scr_buf` is not nil and `scr_buf_len` > 0 the RAM buffer is used in place
// - otherwise, if `file_path` is not empty, the file is opened and it is read
//   through an internal buffer of `mem_buf_size` bytes
// If `end_index` is -1 it is replaced with the length of the source.
// The bytes EF BB BF (UTF-8 BOM) at the start of the source are detected and skipped.
// If `create_map_csv` is true the csv map is created before returning.
// It returns an error if the file is not found, if `mem_buf_size` is too small,
// if an I/O operation fails or if the creation of the csv map fails.
// On error the file and the read buffer already acquired are released.
pub fn csv_reader(cfg RandomAccessReaderConfig) !&RandomAccessReader {
	mut cr := &RandomAccessReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer,
		// the 3 bytes are read only if the buffer really holds them
		if cfg.scr_buf_len >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
				}
			}
		}
	} else if cfg.file_path.len > 0 {
		// the source is a file
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		// the BOM check loads 4 bytes in the read buffer
		if cfg.mem_buf_size < 4 {
			return error('ERROR: mem_buf_size ${cfg.mem_buf_size} is too small, at least 4 bytes are needed!')
		}
		cr.mem_buf_type = file_csv // File buffer
		cr.init_file_source(cfg) or {
			// do not leak the file handle and the read buffer
			cr.dispose_csv_reader()
			return err
		}
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote_remove = cfg.quote_remove
	cr.quote = cfg.quote

	cr.create_map_csv = cfg.create_map_csv
	if cr.create_map_csv {
		cr.map_csv() or {
			// the reader is not returned, release what it owns
			cr.dispose_csv_reader()
			return err
		}
	}

	return cr
}

// init_file_source opens the csv file, allocates the read buffer and checks for the BOM.
// The caller must release the resources if an error is returned.
fn (mut cr RandomAccessReader) init_file_source(cfg RandomAccessReaderConfig) ! {
	// open the file before the allocation: a failed open leaves nothing to release
	cr.f = os.open_file(cfg.file_path, 'rb')!

	// allocate the memory
	unsafe {
		cr.mem_buf = malloc(cfg.mem_buf_size)
		cr.mem_buf_size = cfg.mem_buf_size
	}

	// get the length of the file
	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// check if BOM header is in the file
	if cr.index == 0 {
		if cr.f.read_into_ptr(cr.mem_buf, 4)! == 4 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
				}
			}
		}
		// return to the start position after the check
		cr.f.seek(cfg.start_index, .start)!
	}
}
190
// dispose_csv_reader releases the resources used by the csv reader.
// A file based reader closes its file and frees its read buffer,
// a RAM based reader releases nothing because its buffer is not allocated here.
pub fn (mut cr RandomAccessReader) dispose_csv_reader() {
	// only the file based reader has something to release
	if cr.mem_buf_type != file_csv {
		return
	}

	// file close
	if cr.f.is_opened {
		cr.f.close()
	}

	// no buffer allocated, or buffer already released
	if cr.mem_buf_size <= 0 {
		return
	}

	// free the allocated memory
	unsafe {
		free(cr.mem_buf)
	}
	cr.mem_buf = unsafe { nil }
	cr.mem_buf_size = 0
}
211
// fill_buffer makes the data starting at the offset `i` available in `mem_buf`
// and updates `mem_buf_start` and `mem_buf_end`.
// RAM source: nothing is loaded, the returned value is `mem_buf_size - i`.
// File source: `start_index` is set to `i`, the file is read from `i` for at most
// `mem_buf_size` bytes and the number of bytes read is returned.
// -1 is returned for an unknown type of buffer.
fn (mut cr RandomAccessReader) fill_buffer(i i64) !i64 {
	if cr.mem_buf_type == ram_csv {
		// the ram buffer is static, only the window limits change
		cr.mem_buf_start = i
		cr.mem_buf_end = cr.mem_buf_size
		return cr.mem_buf_end - cr.mem_buf_start
	}
	if cr.mem_buf_type != file_csv {
		return i64(-1)
	}

	// file source: load the buffer from the file
	cr.start_index = i
	cr.f.seek(i, .start)!
	// IMPORTANT: add 64 bit support in vlib!!
	loaded_bytes := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
	cr.mem_buf_start = i
	cr.mem_buf_end = i + loaded_bytes
	return i64(loaded_bytes)
}
234
// copy_configuration copies the configuration from another csv RandomAccessReader:
// `header_row`, `header_list`, `header_map` and `csv_map` are taken from `src_cr`.
// this function is a helper for using the RandomAccessReader in multi threaded applications
// pay attention to the free process
// NOTE(review): the list, the map and the csv map are assigned from the address of the
// fields of `src_cr` inside an unsafe block, presumably they share the storage with the
// source instead of being deep copies - confirm before changing or releasing one of the readers
pub fn (mut cr RandomAccessReader) copy_configuration(src_cr RandomAccessReader) {
	cr.header_row = src_cr.header_row
	unsafe {
		cr.header_list = &src_cr.header_list
		cr.header_map = &src_cr.header_map
		cr.csv_map = &src_cr.csv_map
	}
}
246
247/******************************************************************************
248*
249* Csv mapper, mapped reader
250*
251******************************************************************************/
// map_csv creates an index of the whole csv source to allow the random access to every cell.
// For each valid row an array of offsets is added to `cr.csv_map`: the start of the row,
// the position of each separator found outside the quotes and the position of the end of the row.
// The rows not inserted in the map are:
// - the rows with the comment char, outside the quotes, before the first separator
// - the empty rows
// A previous map is discarded, so the function can be called more than once.
// It returns an error if a row ends with a quote not closed or if the source can not be read.
pub fn (mut cr RandomAccessReader) map_csv() ! {
	mut count := 0
	mut i := i64(0)
	mut capture_flag := true
	mut drop_row := false
	mut quote_flag := false // true if we are parsing inside a quote

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}

	// always start from an empty map, appending to an old map
	// mixes the offsets of the old rows with the new ones
	cr.csv_map = [][]i64{}

	unsafe {
		p := &u8(cr.mem_buf)
		cr.csv_map << []i64{}
		cr.csv_map[0] << if cr.is_bom_present { 3 } else { 0 } // skip the BOM data

		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			// no more data before end_index: stop, without progress this loop never ends
			if read_bytes_count <= 0 {
				break
			}

			mut p1 := p
			mut i1 := i64(0)
			for i1 < read_bytes_count {
				if *p1 == cr.quote {
					// manage quote char
					quote_flag = !quote_flag
					p1++
					i1++
				} else if !quote_flag && *p1 == cr.comment
					&& cr.csv_map[cr.csv_map.len - 1].len <= 1 {
					// manage comment line: the comment char is in the first cell of the row
					drop_row = true
					p1++
					i1++
				} else if !quote_flag && capture_flag && *p1 == cr.separator && !drop_row {
					// capture separator
					cr.csv_map[cr.csv_map.len - 1] << (i + i1)

					p1 += cr.separator_len
					i1 += cr.separator_len
				} else if *p1 == cr.end_line {
					// capture end line
					if quote_flag {
						error_col := cr.csv_map[cr.csv_map.len - 1].last() - cr.csv_map[cr.csv_map.len - 1].first()
						return error('ERROR: quote not closed at row ${count} after column ${error_col}!')
					}
					count++

					// end of the row, for \r\n it is the position of the \r
					cr.csv_map[cr.csv_map.len - 1] << (i + i1) - (cr.end_line_len - 1)
					p1 += cr.end_line_len
					i1 += cr.end_line_len

					if drop_row == true {
						// commented row, recycle it
						cr.csv_map[cr.csv_map.len - 1].clear()
						drop_row = false
					} else {
						// skip empty rows
						if cr.csv_map[cr.csv_map.len - 1].len == 2
							&& cr.csv_map[cr.csv_map.len - 1][0] == cr.csv_map[cr.csv_map.len - 1][1] {
							// recycle the row
							cr.csv_map[cr.csv_map.len - 1].clear()
						} else {
							// it all ok, insert a new row
							cr.csv_map << []i64{cap: cr.csv_map[cr.csv_map.len - 1].len}
						}
					}

					// start of the next row, it is the byte after the end line char
					cr.csv_map[cr.csv_map.len - 1] << (i + i1) - (cr.end_line_len - 1)

					// the net step is of one byte whatever end_line_len is
					p1 -= (cr.end_line_len - 1)
					i1 -= (cr.end_line_len - 1)
				} else {
					p1++
					i1++
				}
			}
			i += read_bytes_count
		}
	}
	// remove last row if it is not a valid one
	if cr.csv_map[cr.csv_map.len - 1].len < 2 {
		cr.csv_map.delete(cr.csv_map.len - 1)
	}

	// if File return to the start of the file
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
}
360
// get_row gets a row from the CSV file as a string array, one string for each cell.
// An empty array is returned if `y` is not a row of the csv map.
// It returns an error if the read of a cell fails.
// NOTE(review): the row is read only when the csv map holds more than one row,
// this condition is kept as it was, it looks like a possible off-by-one - confirm
pub fn (mut cr RandomAccessReader) get_row(y int) ![]string {
	mut h := []string{}
	// an index out of the map would panic in the access to csv_map
	if y < 0 || y >= cr.csv_map.len {
		return h
	}
	if cr.csv_map.len > 1 {
		// a row with n offsets holds n - 1 cells
		for x in 0 .. (cr.csv_map[y].len - 1) {
			h << cr.get_cell(x: x, y: y)!
		}
	}
	return h
}
371
// GetCellConfig selects the cell to read with `get_cell` and `get_cellt`
@[params]
pub struct GetCellConfig {
pub:
	x int // column index of the cell
	y int // row index of the cell in the csv map
}
378
// get_cell reads a single cell and returns it as a string.
// `cfg.x` is the column and `cfg.y` is the row in the csv map.
// It returns:
// - `default_cell` if the cell is out of the csv boundaries, negative indexes included
// - `empty_cell` if the cell has no chars
// If `quote_remove` is true and the cell holds a quote, only the text between the
// first and the last quote of the cell is returned, a cell without quotes is returned as is.
// It returns an error if the source can not be read or if the cell is bigger than the read buffer.
pub fn (mut cr RandomAccessReader) get_cell(cfg GetCellConfig) !string {
	if cfg.x >= 0 && cfg.y >= 0 && cfg.y < cr.csv_map.len
		&& cfg.x < (cr.csv_map[cfg.y].len - 1) {
		mut start := cr.csv_map[cfg.y][cfg.x]
		mut end := cr.csv_map[cfg.y][cfg.x + 1]

		// after the first column the start offset is the position of the separator, skip it
		if cfg.x > 0 {
			start++
		}

		mut len := end - start
		if len <= 0 {
			return cr.empty_cell
		}

		// fill the buffer if needed
		if !(start >= cr.mem_buf_start && end < cr.mem_buf_end) {
			cr.fill_buffer(start)!
			// the copy below must not read over the data loaded in the buffer
			if end > cr.mem_buf_end {
				return error('ERROR: cell [${cfg.x},${cfg.y}] of ${len} bytes does not fit in the read buffer!')
			}
		}
		unsafe {
			// execute this section only if we need to remove the quotes
			if cr.quote_remove {
				// look for the first quote of the cell
				mut first_quote := i64(-1)
				mut pos := start
				mut tmp_p := &u8(cr.mem_buf) + start - cr.start_index
				for pos < end {
					if *tmp_p == cr.quote {
						first_quote = pos
						break
					}
					pos++
					tmp_p++
				}

				// a cell without quotes has nothing to remove
				if first_quote >= 0 {
					// remove front quote and what is before it
					start = first_quote + 1

					// remove back quote and what is after it,
					// the scan starts from the last char of the cell
					tmp_p = &u8(cr.mem_buf) + end - 1 - cr.start_index
					for end > start {
						if *tmp_p == cr.quote {
							end--
							break
						}
						tmp_p--
						end--
					}

					len = end - start
					if len <= 0 {
						return cr.empty_cell
					}
				}
			}

			// create the string from the buffer
			mut tmp_mem := malloc_noscan(isize(len + 1))
			mem_start := &u8(cr.mem_buf) + start - cr.start_index
			vmemcpy(tmp_mem, mem_start, isize(len))
			tmp_mem[len] = 0 // 0 for C string compatibility
			ret_str := tos(tmp_mem, int(len))
			return ret_str
		}
	}
	return cr.default_cell
}
449
// CellValue is the value of a cell returned by `get_cellt`
pub type CellValue = f32 | int | string
451
// get_cellt reads a single cell and returns a sum type CellValue.
// If a header was built and `cfg.x` is one of its columns, the cell is converted
// following the type of the column: int and f32 are parsed from the cell without
// the spaces around it, string is returned as is.
// In every other case the string returned by `get_cell` is returned.
pub fn (mut cr RandomAccessReader) get_cellt(cfg GetCellConfig) !CellValue {
	// a negative column must never be used as index of header_list
	if cr.header_row >= 0 && cfg.x >= 0 && cfg.x < cr.header_list.len {
		h := cr.header_list[cfg.x]
		res := cr.get_cell(cfg)!
		match h.htype {
			.int {
				return res.trim_space().int()
			}
			.string {
				return res
			}
			.f32 {
				return res.trim_space().f32()
			}
		}
	}
	return cr.get_cell(cfg)!
}
471
472/******************************************************************************
473*
474* Header management
475*
476******************************************************************************/
// GetHeaderConf holds the parameters accepted by `build_header_dict`
@[params]
pub struct GetHeaderConf {
pub:
	header_row int // row where to inspect the header
}
482
// build_header_dict builds the header from the row `cfg.header_row` of the csv map,
// the row 0 is used if no row number is passed.
// For each column a HeaderItem is added to `header_list` and the label is added to `header_map`.
// It tries to infer the type of the column using the row after the header:
// a cell made only of digits and signs is an int, a cell that also holds a `.` or
// an exponent is a f32, every other cell, included a cell without digits, is a string.
// By default all the columns are of the string type.
// The header of a previous call is replaced.
// Nothing is done if the csv map has less than two rows or if the row is out of the map.
pub fn (mut cr RandomAccessReader) build_header_dict(cfg GetHeaderConf) ! {
	if cr.csv_map.len > 1 && cfg.header_row >= 0 && cfg.header_row < cr.csv_map.len {
		cr.header_row = cfg.header_row
		// start from an empty header, a second call must not append duplicated items
		cr.header_list = []HeaderItem{}
		cr.header_map = map[string]int{}

		for col in 0 .. (cr.csv_map[cfg.header_row].len - 1) {
			// fill the base struct
			label := cr.get_cell(x: col, y: cfg.header_row)!
			mut h := HeaderItem{
				label:  label
				column: col
				htype:  .string
			}

			// try to infer the type if we have at least one more row
			if cfg.header_row + 1 < cr.csv_map.len {
				x := cr.get_cell(x: col, y: cfg.header_row + 1)!.trim_space().to_lower()
				mut int_c := int(0)
				mut float_c := int(0)
				mut alpha_c := int(0)
				mut htype := ColumType.string
				// raw estimation of the type
				for c in x {
					if c in [`+`, `-`] {
						continue
					}
					if c >= `0` && c <= `9` {
						int_c++
						continue
					}
					if c == `.` {
						float_c++
						continue
					}
					// the exponent is valid only after a digit or a dot
					if c in [`e`, `E`] && (float_c > 0 || int_c > 0) {
						float_c++
						continue
					}
					// any other char makes the cell a string
					alpha_c++
					break
				}

				// without alpha chars it can be an int or a float, but only if
				// there is at least one digit: '', '-' or '.' are not numbers
				if alpha_c == 0 && int_c > 0 {
					if float_c > 0 {
						htype = .f32
					} else {
						htype = .int
					}
				}
				h.htype = htype
			}

			cr.header_list << h
			cr.header_map[label] = col
		}
	}
}
544
545/******************************************************************************
546*
547* Utility function
548*
549******************************************************************************/
// rows_count counts the end line chars found in the csv source.
// The source is scanned from the offset 0, one buffer at a time, until `end_index`
// is reached or no more data is available, so comment rows and empty rows are counted too.
// It returns an error if the source can not be read.
pub fn (mut cr RandomAccessReader) rows_count() !i64 {
	mut count := i64(0)
	mut i := i64(0)

	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	unsafe {
		p := &u8(cr.mem_buf)
		for i < cr.end_index {
			read_bytes_count := cr.fill_buffer(i)!
			// no more data before end_index: stop, without progress this loop never ends
			if read_bytes_count <= 0 {
				break
			}
			mut p1 := p
			mut i1 := i64(0) // same type of read_bytes_count
			for i1 < read_bytes_count {
				if *p1 == cr.end_line {
					count++
				}
				p1++
				i1++
			}
			i += read_bytes_count
		}
	}
	if cr.mem_buf_type == file_csv {
		cr.f.seek(cr.start_index, .start)!
	}
	return count
}
580}
581