| 1 | // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved. |
| 2 | // Use of this source code is governed by an MIT license |
| 3 | // that can be found in the LICENSE file. |
| 4 | module csv |
| 5 | |
// Once interfaces are further along, the idea is to have something similar to
// Go's io.Reader and bufio.Reader rather than reading the whole file into a string;
// this reader would then satisfy that interface. I designed it this way to be easily adapted.
// CommentIsDelimiterError is returned by read_record when the reader's
// delimiter byte and comment byte are identical.
struct CommentIsDelimiterError {
	Error
}

// msg returns the fixed, human-readable description of the error.
fn (e CommentIsDelimiterError) msg() string {
	return 'encoding.csv: comment cannot be the same as delimiter'
}
| 16 | |
// InvalidDelimiterError is returned by read_record when the delimiter is
// rejected by valid_delim, or when a quoted first field is not followed by
// the delimiter.
struct InvalidDelimiterError {
	Error
}

// msg returns the fixed, human-readable description of the error.
fn (e InvalidDelimiterError) msg() string {
	return 'encoding.csv: invalid delimiter'
}
| 24 | |
// EndOfFileError is returned by read_line once the read position has reached
// the end of the data, i.e. there is nothing left to read.
struct EndOfFileError {
	Error
}

// msg returns the fixed, human-readable description of the error.
fn (e EndOfFileError) msg() string {
	return 'encoding.csv: end of file'
}
| 32 | |
// InvalidLineEndingError is returned by read_line when, reading from the very
// start of the data, neither a `\n` nor a `\r` can be found.
struct InvalidLineEndingError {
	Error
}

// msg returns the fixed, human-readable description of the error.
fn (e InvalidLineEndingError) msg() string {
	return 'encoding.csv: could not find any valid line endings'
}
| 40 | |
// Reader holds the complete CSV text to parse together with the current
// parsing position. Instances are created with new_reader.
struct Reader {
	// not used yet
	// has_header bool
	// headings []string
	// data is the whole CSV input; read_line slices one line at a time out of it.
	data string
	// delimiter is the byte that separates fields within a record.
	delimiter u8
	// comment: a line whose first byte equals this value is treated as a comment.
	comment u8
mut:
	// is_mac_pre_osx_le is set by read_line when the data contains no `\n`
	// but does contain `\r`; from then on `\r` is used as the line ending.
	is_mac_pre_osx_le bool
	// row_pos is the index into data at which the next line starts.
	row_pos int
}
| 52 | |
// ReaderConfig carries the optional settings accepted by new_reader.
@[params]
pub struct ReaderConfig {
pub:
	// delimiter is the field separator byte; defaults to a comma.
	delimiter u8 = `,`
	// comment is the byte that marks a comment line; defaults to `#`.
	comment u8 = `#`
}
| 59 | |
// new_reader creates a Reader over the CSV text in `data`.
// `config` optionally overrides the field delimiter and the comment byte.
// Neither value is checked here; read_record validates them on each call.
pub fn new_reader(data string, config ReaderConfig) &Reader {
	reader := &Reader{
		data:      data
		delimiter: config.delimiter
		comment:   config.comment
	}
	return reader
}
| 69 | |
// read returns the next row of the CSV data as an array with one string per
// column. Any error produced by read_record is passed on to the caller.
pub fn (mut r Reader) read() ![]string {
	return r.read_record()!
}
| 76 | |
// TODO: enable read_all once multi-dimensional arrays are supported.
| 78 | // pub fn (mut r Reader) read_all() ?[][]string { |
| 79 | // mut records := []string{} |
| 80 | // for { |
| 81 | // record := r.read_record() or { |
| 82 | // if err.error == err_eof.error { |
| 83 | // return records |
| 84 | // } else { |
| 85 | // return err |
| 86 | // } |
| 87 | // } |
| 88 | // records << record |
| 89 | // } |
| 90 | // return records |
| 91 | // } |
// read_line returns the next physical line of the data, without its line
// ending, and moves row_pos just past that line ending.
//
// Lines end at `\n`, or at `\r` once is_mac_pre_osx_le has been set. A `\r`
// left at the end of a `\n`-terminated line (Windows line endings) is removed.
//
// Errors:
// - EndOfFileError when row_pos is already at or beyond the end of the data.
// - InvalidLineEndingError when reading from position 0 and the data contains
//   neither `\n` nor `\r`.
fn (mut r Reader) read_line() !string {
	start := r.row_pos
	if start >= r.data.len {
		return &EndOfFileError{}
	}
	terminator := if r.is_mac_pre_osx_le { '\r' } else { '\n' }
	mut end := r.data.index_after(terminator, start) or { -1 }
	if end == -1 {
		if start != 0 {
			// the final line has no line ending: it runs to the end of the data
			end = r.data.len
		} else {
			// nothing found from the very start: try pre-OSX Mac line endings
			end = r.data.index_after('\r', start) or { -1 }
			if end == -1 {
				return &InvalidLineEndingError{}
			}
			r.is_mac_pre_osx_le = true
		}
	}
	r.row_pos = end + 1
	line := r.data[start..end]
	// Windows line endings: drop the `\r` that preceded the `\n`
	if !r.is_mac_pre_osx_le && line.ends_with('\r') {
		return line[..line.len - 1]
	}
	return line
}
| 122 | |
// read_record parses the next record (row) of the data and returns its fields.
//
// Empty lines and lines whose first byte equals the comment byte are skipped.
// A field that starts with `"` is treated as quoted: it ends at the first `"`
// that is not doubled, every `""` inside it is unescaped to `"`, and it may
// span several lines. While such a field is still open, further lines are
// read and appended to it, joined by `\n` (empty and comment lines included).
//
// Errors:
// - CommentIsDelimiterError when the delimiter equals the comment byte.
// - InvalidDelimiterError when valid_delim rejects the delimiter, or when the
//   first field of the record is quoted and its closing quote is followed by
//   something other than the delimiter or the end of the line.
// - any error returned by read_line, e.g. EndOfFileError when the data is
//   exhausted (also when it runs out inside an unterminated quoted field).
fn (mut r Reader) read_record() ![]string {
	if r.delimiter == r.comment {
		return &CommentIsDelimiterError{}
	}
	if !valid_delim(r.delimiter) {
		return &InvalidDelimiterError{}
	}
	// Built once: the delimiter does not change while a record is being parsed.
	delim := r.delimiter.ascii_str()
	// need_read: another physical line must be fetched before parsing continues.
	mut need_read := true
	// keep_raw: we are inside an unterminated quoted field, so line breaks and
	// otherwise-skipped lines have to be preserved in `line`.
	mut keep_raw := false
	// line: the not yet parsed remainder of the current record.
	mut line := ''
	mut fields := []string{}
	for {
		if need_read {
			l := r.read_line()!
			if l.len == 0 {
				// empty line: skipped, unless it belongs to a quoted field
				if keep_raw {
					line += '\n'
				}
				continue
			} else if l[0] == r.comment {
				// comment line: skipped, unless it belongs to a quoted field
				if keep_raw {
					line += '\n' + l
				}
				continue
			} else {
				if keep_raw {
					line += '\n'
				}
				line += l
			}
			need_read = false
			keep_raw = false
		}
		if line.len == 0 || line[0] != `"` {
			// unquoted field: runs up to the next delimiter, or to the end of the line
			pos := line.index(delim) or {
				// no delimiter left: this is the last field of the record
				fields << line[..line.len]
				break
			}
			fields << line[..pos]
			line = line[pos + 1..]
			continue
		}
		// quoted field: look for the closing quote, stepping over `""` pairs
		mut closed := false
		mut has_double_quotes := false
		mut j := 0
		mut n := 1
		for n < line.len {
			if line[n] == `"` {
				if n == line.len - 1 || line[n + 1] != `"` {
					closed = true
					j = n - 1
					break
				}
				has_double_quotes = true
				n++ // skip the second quote of the `""` pair
			}
			n++
		}
		if !closed {
			// the field continues on the next physical line
			need_read = true
			keep_raw = true
			continue
		}
		// Drop the opening quote; the closing quote is now at index j.
		line = line[1..]
		field := if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }
		if j + 1 == line.len {
			// the closing quote ends the line: last field of the record
			fields << field
			break
		}
		if line[j + 1] == r.delimiter {
			fields << field
			// slicing at line.len yields '', which is then parsed as a final empty field
			line = line[j + 2..]
			continue
		}
		// The closing quote is followed by something other than the delimiter.
		// For the first field of a record this is reported as an error; otherwise
		// the remainder (opening quote already removed) is parsed again on the
		// next iteration.
		if fields.len == 0 {
			return &InvalidDelimiterError{}
		}
	}
	return fields
}
| 213 | |
// valid_delim reports whether `b` may be used as a field delimiter.
// The NUL byte, the double quote and the two line-ending bytes are rejected.
fn valid_delim(b u8) bool {
	if b == 0 || b == `"` {
		return false
	}
	return b != `\r` && b != `\n`
}
| 217 | |