v / vlib / encoding / csv / csv_reader_sequential.v
298 lines · 263 sloc · 7.46 KB · ddb6685d8a0cb498b5031c644f16d05ac3121ced
Raw
1/*
2csv serial reader 1.0 alpha
3
4Copyright (c) 2023 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7
8Known limitations:
9*/
10module csv
11
12import os
13
// SequentialReaderConfig collects the optional parameters accepted by
// `csv_sequential_reader`. The data source is either a RAM buffer
// (`scr_buf` + `scr_buf_len`) or a file (`file_path`); the RAM buffer is
// checked first, so it wins when both are supplied.
@[params]
pub struct SequentialReaderConfig {
pub:
	scr_buf      voidptr // pointer to the caller-provided buffer holding the csv data
	scr_buf_len  i64     // size in bytes of scr_buf; the RAM source is used only if scr_buf != 0 and scr_buf_len > 0
	file_path    string  // path of the csv file, used when no RAM buffer is supplied
	start_index  i64     // byte offset in the source where reading starts
	end_index    i64    = -1 // byte offset where reading stops; -1 means "up to the end of the source"
	mem_buf_size int    = 1024 * 64 // size of the read buffer allocated for a file source, default 64 KByte
	separator    u8     = `,` // cell separator char
	comment      u8     = `#` // every line that starts with the comment char is ignored
	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string // return this string for an empty cell
	end_line_len int = endline_cr_len // size in bytes of the end of line sequence
	quote        u8  = `"` // double quote is the standard quote char
}
30
// SequentialReader holds the state of a forward-only csv reader created by
// `csv_sequential_reader`: the data source, the parsing options and the
// position of the next row to read.
pub struct SequentialReader {
pub mut:
	index i64 // position recorded when the reader is created; row parsing advances start_index instead

	f              os.File // source file, used only by a file-backed reader
	f_len          i64     // length in bytes of the source file
	is_bom_present bool    // true if a UTF-8 BOM was found at the beginning of the source

	start_index i64 // offset of the next byte to parse, advanced by every get_next_row call
	end_index   i64 = -1 // offset where parsing stops

	end_line      u8  = `\n` // char that terminates a row
	end_line_len  int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
	separator     u8  = `,` // comma is the default separator
	separator_len int = 1 // size of the separator rune
	quote         u8  = `"` // double quote is the standard quote char

	comment u8 = `#` // every line that starts with the comment char is ignored

	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string = '#' // return this if empty cell; csv_sequential_reader overwrites it with the configured value
	// ram buffer
	mem_buf_type  u32     // buffer type, compared against file_csv and ram_csv
	mem_buf       voidptr // RAM source buffer, or the buffer used to load chars from the file
	mem_buf_size  i64     // size of the buffer
	mem_buf_start i64 = -1 // start index in the source of the data held in mem_buf
	mem_buf_end   i64 = -1 // end index (exclusive) in the source of the data held in mem_buf

	ch_buf []u8 = []u8{cap: 1024} // bytes of the cell currently being parsed
	// error management
	row_count i64 // incremented at every end of line met while parsing
	col_count i64 // incremented for every char parsed, reset at every end of line
}
64
// seq_has_utf8_bom reports whether the `len` readable bytes at `buf` begin
// with the UTF-8 byte order mark (EF BB BF). It never reads past `len`.
fn seq_has_utf8_bom(buf voidptr, len i64) bool {
	if isnil(buf) || len < 3 {
		return false
	}
	p := &u8(buf)
	return unsafe { p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF }
}

// seq_init_file_source measures the already opened file, positions it at
// `cfg.start_index`, resolves the default `end_index` and skips a leading
// UTF-8 BOM. It expects `cr.f` to be open and `cr.mem_buf` to be allocated.
fn (mut cr SequentialReader) seq_init_file_source(cfg SequentialReaderConfig) ! {
	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// a BOM can only be found at the very beginning of the file
	if cr.index == 0 {
		// never ask for more bytes than the read buffer can hold
		probe_len := if cr.mem_buf_size < 3 { int(cr.mem_buf_size) } else { 3 }
		read_len := cr.f.read_into_ptr(cr.mem_buf, probe_len)!
		if seq_has_utf8_bom(cr.mem_buf, read_len) {
			cr.is_bom_present = true
			cr.index += 3 // skip the BOM
			cr.start_index += 3 // skip the BOM
		}
		cr.f.seek(cfg.start_index, .start)!
	}
}

// csv_sequential_reader creates a sequential csv reader.
// The source is the RAM buffer `cfg.scr_buf` when it is not nil and
// `cfg.scr_buf_len > 0`, otherwise the file `cfg.file_path` when it is set.
// When neither is given, a reader without any data source is returned.
// A UTF-8 BOM at the beginning of the source is detected and skipped.
// It returns an error if the file does not exist, if `cfg.mem_buf_size` is
// not positive for a file source, or if the file can not be opened or read;
// on such a failure the file and the read buffer are released before returning.
pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
	mut cr := &SequentialReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	// reading from a RAM buffer
	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer, the helper refuses
		// to look at buffers shorter than the BOM itself
		if seq_has_utf8_bom(cr.mem_buf, cr.mem_buf_size) {
			cr.is_bom_present = true
			cr.index += 3 // skip the BOM
			cr.start_index += 3 // skip the BOM
		}
		// the whole source is always available, no refill is ever needed
		cr.mem_buf_start = 0
		cr.mem_buf_end = cr.mem_buf_size

		// check if is a file source
	} else if cfg.file_path.len > 0 {
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		if cfg.mem_buf_size <= 0 {
			return error('ERROR: mem_buf_size must be greater than 0!')
		}
		cr.mem_buf_type = file_csv // File buffer

		// open the file before allocating, a failed open must not leak the buffer
		cr.f = os.open_file(cfg.file_path, 'rb')!

		// allocate the memory
		unsafe {
			cr.mem_buf = malloc(cfg.mem_buf_size)
		}
		cr.mem_buf_size = cfg.mem_buf_size

		cr.seq_init_file_source(cfg) or {
			// release the file and the buffer, the caller never gets the reader
			cr.dispose_csv_reader()
			return err
		}
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote = cfg.quote

	return cr
}
141
// dispose_csv_reader releases what a file-backed reader holds: the open file
// and the read buffer. A RAM-backed reader is left untouched.
pub fn (mut cr SequentialReader) dispose_csv_reader() {
	// only the file-backed reader has something to release here
	if cr.mem_buf_type != file_csv {
		return
	}

	// close the source file if it is still open
	if cr.f.is_opened {
		cr.f.close()
	}

	// nothing allocated, or already released
	if cr.mem_buf_size <= 0 {
		return
	}

	// give back the read buffer and forget about it
	unsafe {
		free(cr.mem_buf)
		cr.mem_buf = nil
	}
	cr.mem_buf_size = 0
}
162
// has_data returns the number of bytes between the current reading position
// (`start_index`) and the end of the readable range (`end_index`).
pub fn (mut cr SequentialReader) has_data() i64 {
	remaining := cr.end_index - cr.start_index
	return remaining
}
167
// fill_buffer loads into `mem_buf` the chunk of the file that starts at
// `index` and records the range of file offsets now held by the buffer.
// It does nothing for a RAM-backed reader. Seek and read errors are propagated.
fn (mut cr SequentialReader) fill_buffer(index i64) ! {
	// for now do nothing if ram buffer
	if cr.mem_buf_type == ram_csv {
		return
	}

	cr.f.seek(index, .start)!
	// IMPORTANT: add 64 bit support in vlib!!
	loaded := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
	cr.mem_buf_start = index
	cr.mem_buf_end = index + loaded
}
179
// SequentialReadingState lists the states of the parser used by `get_next_row`.
enum SequentialReadingState as u16 {
	comment // inside a comment line, chars are dropped until the end of line
	quote // inside a quoted cell, chars are collected until the closing quote
	after_quote // after a closing quote, waiting for a separator or an end of line
	cell // inside an unquoted cell, this is the initial state
	newline
}
187
// take_cell returns the value of the cell collected so far in `ch_buf` and
// empties the buffer. An empty buffer yields `empty_cell`.
fn (mut cr SequentialReader) take_cell() string {
	cell := if cr.ch_buf.len == 0 { cr.empty_cell } else { cr.ch_buf.bytestr() }
	cr.ch_buf.clear()
	return cell
}

// get_next_row gets the next row from the CSV source as a string array.
// Parsing starts at `start_index` and stops after the first non empty row or
// at `end_index`; `start_index` is then moved forward for the next call.
// Empty rows and rows starting with the comment char are skipped.
// A last row that is not terminated by an end of line is returned as well.
// An empty array is returned when no further row is available.
// It returns an error if a quoted cell is not closed before the end of the
// line or the end of the data, or if the file buffer can not be refilled.
pub fn (mut cr SequentialReader) get_next_row() ![]string {
	mut row_res := []string{}
	// clear the cell buffer
	cr.ch_buf.clear()
	mut i := cr.start_index
	mut state := SequentialReadingState.cell
	// true when the row was closed by an end of line
	mut row_complete := false

	p := &u8(cr.mem_buf)
	for i < cr.end_index {
		// load the chunk of file that holds the char at index i
		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
			cr.fill_buffer(i)!
		}
		ch := unsafe { *(p + i - cr.mem_buf_start) }

		if state == .cell {
			if ch == cr.separator {
				row_res << cr.take_cell()
			} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
				// the comment char is the first char of the row
				state = .comment
			} else if ch == cr.quote {
				state = .quote
				cr.ch_buf.clear()
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0

				// skip empty rows
				if row_res.len > 0 || cr.ch_buf.len > 0 {
					row_res << cr.take_cell()
					// NOTE: with a one byte end of line the index stays on the
					// end of line char, the next call skips it as an empty row
					i += cr.end_line_len - 1
					row_complete = true
					break
				}
			} else if ch == `\r` && cr.end_line_len == 2 {
				// skip CR
			} else { // normal char inside a cell
				cr.ch_buf << ch
			}
		} else if state == .comment {
			// drop every char up to the end of the comment line
			if ch == cr.end_line {
				state = .cell
			}
		} else if state == .quote {
			if ch == cr.quote {
				row_res << cr.take_cell()
				state = .after_quote
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
			} else { // normal char inside a quote inside a cell
				cr.ch_buf << ch
			}
		} else if state == .after_quote {
			// the quoted cell is already stored, wait for what closes it
			if ch == cr.separator {
				state = .cell
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0
				cr.ch_buf.clear()
				i += cr.end_line_len - 1
				row_complete = true
				break
			}
		}
		cr.col_count++
		i++
	}

	// the data ended before an end of line was found
	if !row_complete {
		if state == .quote {
			return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
		}
		// store the pending cell of a last row without the final end of line
		if state == .cell && (row_res.len > 0 || cr.ch_buf.len > 0) {
			row_res << cr.take_cell()
			cr.row_count++
			cr.col_count = 0
		}
	}

	cr.start_index = i
	return row_res
}
299