| 1 | /* |
| 2 | csv serial reader 1.0 alpha |
| 3 | |
| 4 | Copyright (c) 2023 Dario Deledda. All rights reserved. |
| 5 | Use of this source code is governed by an MIT license |
| 6 | that can be found in the LICENSE file. |
| 7 | |
| 8 | Known limitations: |
| 9 | */ |
| 10 | module csv |
| 11 | |
| 12 | import os |
| 13 | |
// SequentialReaderConfig holds the optional parameters of csv_sequential_reader.
// The csv data is taken from a RAM buffer when `scr_buf` is not nil and
// `scr_buf_len` is > 0, otherwise from the file named by `file_path`.
@[params]
pub struct SequentialReaderConfig {
pub:
	// RAM source
	scr_buf     voidptr // address of the first byte of the csv data
	scr_buf_len i64     // bytes available at scr_buf, the RAM source is used only if > 0
	// file source
	file_path string // path of the csv file to read
	// range of the source to parse, expressed as byte offsets
	start_index i64
	end_index   i64 = -1 // -1 is replaced by the length of the source
	// size in bytes of the buffer allocated to read the file, 64KByte by default
	mem_buf_size int = 1024 * 64
	// parsing options
	separator    u8     = `,` // byte that splits a row in cells
	comment      u8     = `#` // a row that starts with this byte is skipped
	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string // string returned in place of an empty cell
	end_line_len int = endline_cr_len // size of the endline rune
	quote        u8  = `"` // byte that opens and closes a quoted cell, double quote is the standard
}
| 30 | |
// SequentialReader keeps the state of a csv source that is parsed one row
// at a time with get_next_row. Create it with csv_sequential_reader and
// release it with dispose_csv_reader.
pub struct SequentialReader {
pub mut:
	// byte offset set while the source is opened
	index i64

	// file source
	f              os.File
	f_len          i64 // length of the file in bytes
	is_bom_present bool // true when the source starts with the UTF-8 BOM

	// byte range still to parse, start_index is moved forward by get_next_row
	start_index i64
	end_index   i64 = -1

	end_line      u8  = `\n` // byte that closes a row
	end_line_len  int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
	separator     u8  = `,` // comma is the default separator
	separator_len int = 1 // size of the separator rune
	quote         u8  = `"` // double quote is the standard quote char

	comment u8 = `#` // every row that starts with the comment char is ignored

	default_cell string = '*' // return this string if out of the csv boundaries
	empty_cell   string = '#' // return this string if the cell is empty
	// ram buffer
	mem_buf_type  u32     // kind of source, compared with file_csv and ram_csv
	mem_buf       voidptr // bytes of the source currently available in memory
	mem_buf_size  i64     // size of the buffer
	mem_buf_start i64 = -1 // index in the source of the first byte held by mem_buf
	mem_buf_end   i64 = -1 // index in the source just after the last byte held by mem_buf

	// bytes of the cell that is being parsed
	ch_buf []u8 = []u8{cap: 1024}
	// error management, used to locate the problem in the error messages
	row_count i64
	col_count i64
}
| 64 | |
// csv_sequential_reader creates a sequential csv reader.
// The source is the RAM buffer `cfg.scr_buf` when it is not nil and
// `cfg.scr_buf_len` is > 0, otherwise the file `cfg.file_path` when it is set.
// When neither is provided the reader is returned without a source attached.
// A leading UTF-8 BOM (EF BB BF) is detected and skipped.
// An error is returned when the file does not exist, when it cannot be
// opened, measured or read, or when `cfg.mem_buf_size` is not positive for a
// file source; on such errors nothing stays allocated or opened.
// Call dispose_csv_reader on the returned reader when it is no longer needed.
pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
	mut cr := &SequentialReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	// reading from a RAM buffer
	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		if cfg.end_index == -1 {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer, the length guard
		// avoids reading past the end of a buffer shorter than the BOM
		if cr.mem_buf_size >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
		}
		cr.mem_buf_start = 0
		cr.mem_buf_end = cr.mem_buf_size

		// check if is a file source
	} else if cfg.file_path.len > 0 {
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		// a buffer without room for at least one byte can not hold any data
		if cfg.mem_buf_size <= 0 {
			return error('ERROR: mem_buf_size must be greater than 0!')
		}
		cr.mem_buf_type = file_csv // File buffer
		cr.init_file_source(cfg) or {
			// release the buffer and the file acquired before the failure
			cr.dispose_csv_reader()
			return err
		}
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote = cfg.quote

	return cr
}

// init_file_source allocates the read buffer, opens `cfg.file_path` and
// positions the reader at `cfg.start_index`, skipping the BOM if present.
// On error the resources acquired so far are left in `cr`, the caller
// releases them with dispose_csv_reader.
fn (mut cr SequentialReader) init_file_source(cfg SequentialReaderConfig) ! {
	// allocate the memory
	unsafe {
		cr.mem_buf = malloc(cfg.mem_buf_size)
		cr.mem_buf_size = cfg.mem_buf_size
	}
	cr.f = os.open_file(cfg.file_path, 'rb')!

	// the length of the file is the position of its end
	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// check if BOM header is in the file
	if cr.index == 0 {
		// probe through a local buffer, mem_buf may be smaller than 4 bytes
		mut head := [4]u8{}
		read_count := cr.f.read_into_ptr(unsafe { &head[0] }, 4)!
		if read_count == 4 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF {
			cr.is_bom_present = true
			cr.index += 3 // skip the BOM
			cr.start_index += 3 // skip the BOM
		}
		// the probe moved the file position, restore it
		cr.f.seek(cfg.start_index, .start)!
	}
}
| 141 | |
// dispose_csv_reader releases the resources held by the reader.
// For a file source it closes the file, if still open, and frees the read
// buffer. For a RAM source nothing is released.
pub fn (mut cr SequentialReader) dispose_csv_reader() {
	if cr.mem_buf_type == ram_csv {
		// do nothing, ram buffer is static
		return
	}
	if cr.mem_buf_type != file_csv {
		return
	}

	// file close
	if cr.f.is_opened {
		cr.f.close()
	}

	// nothing was allocated, or it was already released
	if cr.mem_buf_size <= 0 {
		return
	}

	// free the allocated memory and forget its address
	unsafe {
		free(cr.mem_buf)
	}
	cr.mem_buf = unsafe { nil }
	cr.mem_buf_size = 0
}
| 162 | |
// has_data returns how many bytes are left between the current read
// position (start_index) and the end of the range to parse (end_index).
pub fn (mut cr SequentialReader) has_data() i64 {
	remaining := cr.end_index - cr.start_index
	return remaining
}
| 167 | |
// fill_buffer loads in mem_buf the bytes of the file that start at `index`
// and records in mem_buf_start/mem_buf_end the range that was loaded.
// It does nothing for a RAM source. Seek and read errors are propagated.
fn (mut cr SequentialReader) fill_buffer(index i64) ! {
	if cr.mem_buf_type == ram_csv {
		// for now do nothing if ram buffer
		return
	}

	cr.f.seek(index, .start)!
	// IMPORTANT: add 64 bit support in vlib!!
	bytes_read := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
	cr.mem_buf_start = index
	cr.mem_buf_end = index + bytes_read
}
| 179 | |
// SequentialReadingState is the state of the parser used by get_next_row.
enum SequentialReadingState as u16 {
	comment     // inside a comment row, left at the end of the line
	quote       // inside a quoted cell, waiting for the closing quote
	after_quote // after the closing quote, waiting for a separator or the end of the line
	cell        // inside an unquoted cell, this is the initial state
	newline
}
| 187 | |
// get_next_row get the next row from the CSV file as a string array.
// Parsing starts at start_index and stops after the end of line that closes
// the row, or at end_index; start_index is then moved to the first byte
// that was not consumed. Empty rows and rows starting with the comment char
// are skipped. Empty cells are returned as empty_cell.
// A last row that is not closed by an end of line is returned complete.
// An empty array is returned when no more rows are available.
// It returns an error when the buffer can not be refilled from the file or
// when a quoted cell is not closed before the end of the line.
pub fn (mut cr SequentialReader) get_next_row() ![]string {
	mut row_res := []string{}
	// clear the cell buffer
	cr.ch_buf.clear()
	mut i := cr.start_index
	mut state := SequentialReadingState.cell
	// true when the row was closed by an end of line
	mut row_done := false

	p := &u8(cr.mem_buf)
	for i < cr.end_index {
		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
			cr.fill_buffer(i)!
		}
		// i is inside [mem_buf_start, mem_buf_end) here, see the check above
		ch := unsafe { *(p + i - cr.mem_buf_start) }

		if state == .cell {
			if ch == cr.separator {
				row_res << cr.take_cell()
			} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
				state = .comment
			} else if ch == cr.quote {
				state = .quote
				cr.ch_buf.clear()
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0

				// skip empty rows
				if !(row_res.len == 0 && cr.ch_buf.len < 1) {
					row_res << cr.take_cell()
					// consume the end of line, otherwise the next call parses it
					// again as an empty row and counts the row twice
					i++
					row_done = true
					break
				}
			} else if ch == `\r` && cr.end_line_len == 2 {
				// skip CR
			} else { // normal char inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .comment {
			// the comment lasts until the end of the line
			if ch == cr.end_line {
				state = .cell
			}
		}

		if state == .quote {
			if ch == cr.quote {
				row_res << cr.take_cell()

				state = .after_quote
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
			} else { // normal char inside a quote inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .after_quote {
			if ch == cr.separator {
				state = .cell
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0
				cr.ch_buf.clear()
				i++ // consume the end of line
				row_done = true
				break
			}
		}

		cr.col_count++
		i++
	}

	// the source ended without an end of line: the last cell is still in
	// the cell buffer and must be added to the row
	if !row_done && state == .cell && (row_res.len > 0 || cr.ch_buf.len > 0) {
		row_res << cr.take_cell()
		cr.row_count++
		cr.col_count = 0
	}

	cr.start_index = i
	return row_res
}

// take_cell returns a copy of the bytes collected in ch_buf as a string, or
// empty_cell when no byte was collected, and leaves ch_buf empty.
fn (mut cr SequentialReader) take_cell() string {
	cell := if cr.ch_buf.len == 0 { cr.empty_cell } else { cr.ch_buf.bytestr() }
	cr.ch_buf.clear()
	return cell
}
| 299 | |