v2 / vlib / x / dataframe / dataframe.v
585 lines · 532 sloc · 13.57 KB · e99c0f40f94e1c6057a8b058a43a7bf63f786b5f
Raw
1module dataframe
2
3import encoding.csv
4import math
5import strconv
6
// CsvConfig holds the options used when a DataFrame is loaded from CSV.
// Every field except `has_header` is passed to `csv.csv_reader` under the
// same name by `from_csv` and `read_csv`.
@[params]
pub struct CsvConfig {
pub:
	// has_header: when true, the first CSV row supplies the column names and
	// data starts at the second row; when false, names are generated.
	has_header bool = true
	// separator is handed to the CSV reader as its `separator` option.
	separator u8 = `,`
	// comment is handed to the CSV reader as its `comment` option.
	comment u8 = `#`
	// default_cell is handed to the CSV reader as its `default_cell` option.
	default_cell string
	// empty_cell is handed to the CSV reader as its `empty_cell` option and is
	// also the filler used for rows that have fewer cells than there are columns.
	empty_cell string
	// end_line_len is handed to the CSV reader as its `end_line_len` option.
	end_line_len int = csv.endline_cr_len
	// quote is handed to the CSV reader as its `quote` option.
	quote u8 = `"`
	// quote_remove is handed to the CSV reader as its `quote_remove` option.
	quote_remove bool
}
20
// SortOrder selects the direction used by `sort_by` and `sort_by_f64`.
pub enum SortOrder {
	// asc: smaller values come first.
	asc
	// desc: larger values come first.
	desc
}
26
// Summary is the set of numeric statistics produced by `Series.describe`.
pub struct Summary {
pub:
	// count is the number of values that were summarised.
	count int
	// sum is the total of all values.
	sum f64
	// mean is `sum` divided by `count`.
	mean f64
	// min is the smallest value.
	min f64
	// max is the largest value.
	max f64
	// median is the middle value, or the average of the two middle values
	// when `count` is even.
	median f64
	// stddev is the population standard deviation (divides by `count`).
	stddev f64
}
38
// Series is one named column of string cells; its numeric helpers parse
// the cells to f64 on demand.
pub struct Series {
pub:
	// name is the column name.
	name string
	// values holds the cells of the column, one per row.
	values []string
}
45
// Row exposes the cells of one DataFrame row keyed by column name.
pub struct Row {
pub:
	// values maps each column name to the cell of this row in that column.
	values map[string]string
}
51
// DataFrame is a rectangular table whose cells are all strings.
pub struct DataFrame {
	// index maps a column name to its position inside every row.
	index map[string]int
pub:
	// columns lists the column names in positional order.
	columns []string
	// rows holds the cells row by row; `new` rejects any row whose length
	// differs from the number of columns.
	rows [][]string
}
59
// new creates a DataFrame from column names and rows.
//
// Column names are stored with surrounding whitespace removed. That is the
// same form `build_index` uses for its lookup keys, so every name listed in
// the resulting `columns` can be resolved by `cell`, `column`, `select` and
// the sort helpers.
//
// Errors are returned when `build_index` rejects the names (no columns, an
// empty name, a duplicate name) or when a row's length differs from the
// number of columns. Both `columns` and `rows` are copied.
pub fn new(columns []string, rows [][]string) !DataFrame {
	index := build_index(columns)!
	// Keep the stored names identical to the trimmed keys of `index`.
	names := columns.map(it.trim_space())
	mut copied_rows := [][]string{cap: rows.len}
	for row_index, row in rows {
		if row.len != columns.len {
			return error('row ${row_index} has ${row.len} values, expected ${columns.len}')
		}
		copied_rows << row.clone()
	}
	return DataFrame{
		index: index
		columns: names
		rows: copied_rows
	}
}
76
// empty builds a DataFrame that has the given columns and zero rows.
// Any error reported by `new` for the column names is passed on.
pub fn empty(columns []string) !DataFrame {
	no_rows := [][]string{}
	return new(columns, no_rows)!
}
81
// from_series assembles a DataFrame whose columns are the given series, in order.
// It fails when `series` is empty, when any series differs in length from
// the first one, or when `new` rejects the resulting names.
pub fn from_series(series []Series) !DataFrame {
	if series.len == 0 {
		return error('at least one series is required')
	}
	expected := series[0].values.len
	mut names := []string{cap: series.len}
	for current in series {
		if current.values.len != expected {
			return error('series `${current.name}` has ${current.values.len} values, expected ${expected}')
		}
		names << current.name
	}
	// Turn the column-major input into row-major cells.
	mut table := [][]string{cap: expected}
	for r in 0 .. expected {
		mut cells := []string{len: series.len}
		for c, current in series {
			cells[c] = current.values[r]
		}
		table << cells
	}
	return new(names, table)!
}
105
// from_columns builds a DataFrame from a name -> cells map.
// Columns are placed in sorted name order. An empty map is an error;
// other errors come from `from_series`.
pub fn from_columns(columns map[string][]string) !DataFrame {
	if columns.len == 0 {
		return error('at least one column is required')
	}
	mut ordered_names := columns.keys()
	ordered_names.sort()
	mut ordered := []Series{len: ordered_names.len}
	for i, key in ordered_names {
		ordered[i] = Series{
			name: key
			values: columns[key].clone()
		}
	}
	return from_series(ordered)!
}
122
// from_csv parses CSV text held in memory into a DataFrame.
// Empty text is rejected up front. The reader options are taken from `cfg`,
// and the reader is disposed when the function exits.
pub fn from_csv(text string, cfg CsvConfig) !DataFrame {
	if text == '' {
		return error('csv input is empty')
	}
	mut rdr := csv.csv_reader(
		scr_buf: text.str
		scr_buf_len: text.len
		separator: cfg.separator
		comment: cfg.comment
		default_cell: cfg.default_cell
		empty_cell: cfg.empty_cell
		end_line_len: cfg.end_line_len
		quote: cfg.quote
		quote_remove: cfg.quote_remove
	)!
	defer {
		rdr.dispose_csv_reader()
	}
	frame := from_csv_reader(mut rdr, cfg)!
	return frame
}
144
// read_csv loads the CSV file at `path` into a DataFrame.
// The reader options are taken from `cfg`, and the reader is disposed when
// the function exits. Errors from opening or reading are passed on.
pub fn read_csv(path string, cfg CsvConfig) !DataFrame {
	mut rdr := csv.csv_reader(
		file_path: path
		separator: cfg.separator
		comment: cfg.comment
		default_cell: cfg.default_cell
		empty_cell: cfg.empty_cell
		end_line_len: cfg.end_line_len
		quote: cfg.quote
		quote_remove: cfg.quote_remove
	)!
	defer {
		rdr.dispose_csv_reader()
	}
	frame := from_csv_reader(mut rdr, cfg)!
	return frame
}
162
// from_csv_reader turns the rows exposed by an open CSV reader into a DataFrame.
// With `cfg.has_header` the first row provides the (trimmed) column names and
// data starts at row 1; otherwise names are generated and data starts at row 0.
// Every data row is padded with `cfg.empty_cell` or cut to the column count.
fn from_csv_reader(mut reader csv.RandomAccessReader, cfg CsvConfig) !DataFrame {
	if reader.csv_map.len == 0 {
		return error('csv input has no rows')
	}
	header := reader.get_row(0)!
	if header.len == 0 {
		return error('csv input has no columns')
	}
	mut columns := []string{}
	mut first_data_row := 0
	if cfg.has_header {
		columns = normalize_columns(header)
		first_data_row = 1
	} else {
		columns = default_columns(header.len)
	}
	total_rows := reader.csv_map.len
	mut rows := [][]string{cap: math.max(0, total_rows - first_data_row)}
	for i in first_data_row .. total_rows {
		raw := reader.get_row(i)!
		rows << normalize_row(raw, columns.len, cfg.empty_cell)
	}
	return new(columns, rows)!
}
184
// height reports how many rows the DataFrame holds.
pub fn (df DataFrame) height() int {
	row_count := df.rows.len
	return row_count
}
189
// width reports how many columns the DataFrame holds.
pub fn (df DataFrame) width() int {
	column_count := df.columns.len
	return column_count
}
194
// shape reports the table dimensions as (rows, columns).
pub fn (df DataFrame) shape() (int, int) {
	row_count := df.rows.len
	column_count := df.columns.len
	return row_count, column_count
}
199
// cell looks up one cell by row position and column name.
// It fails when the row position is out of range or the column is unknown.
pub fn (df DataFrame) cell(row_index int, column string) !string {
	in_range := row_index >= 0 && row_index < df.rows.len
	if !in_range {
		return error('row index ${row_index} is out of range')
	}
	position := df.column_index(column)!
	record := df.rows[row_index]
	return record[position]
}
208
// row builds a name -> cell view of the row at `row_index`.
// It fails when the position is out of range.
pub fn (df DataFrame) row(row_index int) !Row {
	if row_index < 0 || row_index >= df.rows.len {
		return error('row index ${row_index} is out of range')
	}
	cells := df.rows[row_index]
	mut named := map[string]string{}
	for i in 0 .. df.columns.len {
		named[df.columns[i]] = cells[i]
	}
	return Row{
		values: named
	}
}
223
// column extracts the named column as a Series (one cell per row).
// It fails when the column is unknown.
pub fn (df DataFrame) column(name string) !Series {
	position := df.column_index(name)!
	mut cells := []string{len: df.rows.len}
	for i, record in df.rows {
		cells[i] = record[position]
	}
	return Series{
		name: name
		values: cells
	}
}
236
// select projects the DataFrame onto `names`, in the order given.
// It fails when a name is unknown, or when `new` rejects the name list
// (for example an empty list or a repeated name).
pub fn (df DataFrame) select(names []string) !DataFrame {
	mut positions := []int{len: names.len}
	for i, name in names {
		positions[i] = df.column_index(name)!
	}
	mut projected := [][]string{cap: df.rows.len}
	for record in df.rows {
		mut picked := []string{len: positions.len}
		for i, position in positions {
			picked[i] = record[position]
		}
		projected << picked
	}
	return new(names, projected)!
}
253
// head returns a copy holding at most the first `n` rows.
// A non-positive `n` gives a DataFrame with no rows.
pub fn (df DataFrame) head(n int) DataFrame {
	if n <= 0 {
		return df.with_rows([][]string{})
	}
	limit := if n < df.rows.len { n } else { df.rows.len }
	return df.with_rows(df.rows[..limit])
}
262
// tail returns a copy holding at most the last `n` rows.
// A non-positive `n` gives a DataFrame with no rows.
pub fn (df DataFrame) tail(n int) DataFrame {
	if n <= 0 {
		return df.with_rows([][]string{})
	}
	skip := if n < df.rows.len { df.rows.len - n } else { 0 }
	return df.with_rows(df.rows[skip..])
}
271
// filter returns a new DataFrame holding the rows for which `predicate`
// returns true, in their original order. The predicate receives each row as
// a `Row` (column name -> cell).
pub fn (df DataFrame) filter(predicate fn (Row) bool) DataFrame {
	mut kept := [][]string{cap: df.rows.len}
	for row_index in 0 .. df.rows.len {
		// row_index is always within range here, so the `or` branch is only a
		// formality required to unwrap the result.
		row := df.row(row_index) or { continue }
		if predicate(row) {
			kept << df.rows[row_index].clone()
		}
	}
	// The kept rows are already private copies; build the result directly
	// instead of going through with_rows, which would deep-copy them again.
	return DataFrame{
		index: df.index.clone()
		columns: df.columns.clone()
		rows: kept
	}
}
283
// sort_by returns a copy whose rows are ordered by comparing the cells of
// column `name` as strings. It fails when the column is unknown.
pub fn (df DataFrame) sort_by(name string, order SortOrder) !DataFrame {
	position := df.column_index(name)!
	ordered := df.sorted(position, order, false)!
	return ordered
}
289
// sort_by_f64 returns a copy whose rows are ordered by comparing the cells
// of column `name` as f64 numbers. It fails when the column is unknown or
// when a compared cell is not a number.
pub fn (df DataFrame) sort_by_f64(name string, order SortOrder) !DataFrame {
	position := df.column_index(name)!
	ordered := df.sorted(position, order, true)!
	return ordered
}
295
// value_counts tallies how often each distinct cell value occurs in column
// `name`. It fails when the column is unknown.
pub fn (df DataFrame) value_counts(name string) !map[string]int {
	position := df.column_index(name)!
	mut tally := map[string]int{}
	for record in df.rows {
		key := record[position]
		// A key that is not present yet reads as 0.
		tally[key] = tally[key] + 1
	}
	return tally
}
306
// describe computes the numeric Summary of column `name`.
// It fails when the column is unknown or when `Series.describe` fails.
pub fn (df DataFrame) describe(name string) !Summary {
	target := df.column(name)!
	stats := target.describe()!
	return stats
}
312
// sorted returns a copy of the DataFrame with its rows ordered by the cells
// at `column_index`, using a stable insertion sort (rows that compare equal
// keep their relative order).
//
// When `numeric` is true the cells are compared as f64. Each cell is parsed
// exactly once before sorting starts, and a cell that is not a number is
// reported as `<column>[<row>] is not a number`. With fewer than two rows no
// comparison takes place, so nothing is parsed and no error can occur.
// When `numeric` is false the cells are compared as strings.
fn (df DataFrame) sorted(column_index int, order SortOrder, numeric bool) !DataFrame {
	mut rows := clone_rows(df.rows)
	mut keys := []f64{}
	if numeric && rows.len > 1 {
		keys = []f64{cap: rows.len}
		for row_index, row in rows {
			keys << parse_f64(row[column_index], '${df.columns[column_index]}[${row_index}]')!
		}
	}
	for i in 1 .. rows.len {
		mut j := i
		for j > 0 {
			mut out_of_order := false
			if numeric {
				out_of_order = match order {
					.asc { keys[j - 1] > keys[j] }
					.desc { keys[j - 1] < keys[j] }
				}
			} else {
				out_of_order = should_swap(rows[j - 1][column_index], rows[j][column_index],
					order, false)!
			}
			if !out_of_order {
				break
			}
			rows[j - 1], rows[j] = rows[j], rows[j - 1]
			if numeric {
				// Keep the parsed keys aligned with the rows they belong to.
				keys[j - 1], keys[j] = keys[j], keys[j - 1]
			}
			j--
		}
	}
	// `rows` is already a private deep copy, so it is used as is.
	return DataFrame{
		index: df.index.clone()
		columns: df.columns.clone()
		rows: rows
	}
}
327
// with_rows returns a DataFrame with the same columns as `df` and a deep
// copy of `rows` as its content. The index and column list are copied too.
fn (df DataFrame) with_rows(rows [][]string) DataFrame {
	row_copy := clone_rows(rows)
	return DataFrame{
		index: df.index.clone()
		columns: df.columns.clone()
		rows: row_copy
	}
}
335
// column_index resolves a column name to its position within a row.
// It fails with `unknown column` when the name is not in the index.
fn (df DataFrame) column_index(name string) !int {
	position := df.index[name] or { return error('unknown column `${name}`') }
	return position
}
342
// get returns the cell stored under column `name` in this Row.
// It fails with `unknown column` when the Row has no such column.
pub fn (row Row) get(name string) !string {
	cell := row.values[name] or { return error('unknown column `${name}`') }
	return cell
}
350
// len reports how many values the Series holds.
pub fn (s Series) len() int {
	count := s.values.len
	return count
}
355
// get returns the value at position `index` of the Series.
// It fails when the position is out of range.
pub fn (s Series) get(index int) !string {
	if index >= 0 && index < s.values.len {
		return s.values[index]
	}
	return error('series index ${index} is out of range')
}
363
// f64s parses every value of the Series as f64, in order.
// The first value that is not a number aborts the conversion with an error
// labelled `<name>[<position>]`.
pub fn (s Series) f64s() ![]f64 {
	mut numbers := []f64{len: s.values.len}
	for i in 0 .. s.values.len {
		numbers[i] = parse_f64(s.values[i], '${s.name}[${i}]')!
	}
	return numbers
}
372
// sum adds up all values of the Series after parsing them as f64.
// An empty Series sums to 0. It fails when a value is not a number.
pub fn (s Series) sum() !f64 {
	numbers := s.f64s()!
	return sum_f64(numbers)
}
382
// mean returns the arithmetic mean of the Series values.
// It fails when a value is not a number or when the Series is empty.
pub fn (s Series) mean() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	mut total := 0.0
	for number in numbers {
		total += number
	}
	return total / f64(numbers.len)
}
391
// min returns the smallest value of the Series, compared as f64.
// It fails when a value is not a number or when the Series is empty.
pub fn (s Series) min() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	mut smallest := numbers[0]
	for i in 1 .. numbers.len {
		if numbers[i] < smallest {
			smallest = numbers[i]
		}
	}
	return smallest
}
406
// max returns the largest value of the Series, compared as f64.
// It fails when a value is not a number or when the Series is empty.
pub fn (s Series) max() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	mut largest := numbers[0]
	for i in 1 .. numbers.len {
		if numbers[i] > largest {
			largest = numbers[i]
		}
	}
	return largest
}
421
// median returns the middle value of the sorted Series values, or the
// average of the two middle values when the count is even.
// It fails when a value is not a number or when the Series is empty.
pub fn (s Series) median() !f64 {
	mut numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	numbers.sort()
	half := numbers.len / 2
	if numbers.len % 2 == 1 {
		return numbers[half]
	}
	return (numbers[half - 1] + numbers[half]) / 2.0
}
431
// stddev returns the population standard deviation of the Series, i.e. the
// square root of the mean squared distance from the mean.
// It fails when a value is not a number or when the Series is empty.
pub fn (s Series) stddev() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	count := f64(numbers.len)
	average := sum_f64(numbers) / count
	mut squared_total := 0.0
	for number in numbers {
		delta := number - average
		squared_total += delta * delta
	}
	return math.sqrt(squared_total / count)
}
446
// describe computes count, sum, mean, min, max, median and population
// standard deviation of the Series in one pass over its sorted values.
// It fails when a value is not a number or when the Series is empty.
pub fn (s Series) describe() !Summary {
	mut ordered := s.f64s()!
	if ordered.len == 0 {
		return error('series `${s.name}` is empty')
	}
	ordered.sort()
	count := ordered.len
	total := sum_f64(ordered)
	average := total / f64(count)
	mut squared_total := 0.0
	for i in 0 .. count {
		delta := ordered[i] - average
		squared_total += delta * delta
	}
	return Summary{
		count: count
		sum: total
		mean: average
		// The values are sorted, so the extremes sit at both ends.
		min: ordered[0]
		max: ordered[count - 1]
		median: median_f64(ordered)
		stddev: math.sqrt(squared_total / f64(count))
	}
}
471
// build_index maps every column name, with surrounding whitespace removed,
// to its position in `columns`.
// It fails when there are no columns, when a trimmed name is empty, or when
// two names are equal after trimming.
fn build_index(columns []string) !map[string]int {
	if columns.len == 0 {
		return error('at least one column is required')
	}
	mut lookup := map[string]int{}
	for position in 0 .. columns.len {
		key := columns[position].trim_space()
		if key == '' {
			return error('column ${position} is empty')
		}
		if key in lookup {
			return error('duplicate column `${key}`')
		}
		lookup[key] = position
	}
	return lookup
}
489
// clone_rows returns a deep copy of `rows`: a new outer array whose
// elements are clones of the original inner arrays.
fn clone_rows(rows [][]string) [][]string {
	mut duplicate := [][]string{len: rows.len}
	for i, source in rows {
		duplicate[i] = source.clone()
	}
	return duplicate
}
497
// default_columns generates `count` placeholder names:
// `column_0`, `column_1`, and so on.
fn default_columns(count int) []string {
	mut names := []string{cap: count}
	mut position := 0
	for position < count {
		names << 'column_${position}'
		position++
	}
	return names
}
505
// normalize_columns returns the column names with leading and trailing
// whitespace removed, in the same order.
fn normalize_columns(columns []string) []string {
	return columns.map(it.trim_space())
}
513
// normalize_row returns a row of exactly `column_count` cells: the leading
// cells of `row`, followed by `fill` for every position `row` does not cover.
// Cells beyond `column_count` are dropped.
fn normalize_row(row []string, column_count int, fill string) []string {
	mut cells := []string{cap: column_count}
	mut position := 0
	for position < column_count {
		cells << if position < row.len { row[position] } else { fill }
		position++
	}
	return cells
}
525
// should_swap reports whether `left` and `right` are out of order for the
// requested direction: for `.asc` when left is greater than right, for
// `.desc` when left is smaller than right. Equal values never swap.
// With `numeric` the values are compared as f64, which can fail when one of
// them is not a number; otherwise they are compared as strings.
fn should_swap(left string, right string, order SortOrder, numeric bool) !bool {
	mut comparison := 0
	if numeric {
		comparison = compare_f64(left, right)!
	} else {
		comparison = compare_string(left, right)
	}
	if order == .asc {
		return comparison > 0
	}
	return comparison < 0
}
537
// compare_string is a three-way string comparison: -1 when `left` sorts
// before `right`, 1 when it sorts after, 0 when both are equal.
fn compare_string(left string, right string) int {
	return if left < right {
		-1
	} else if left > right {
		1
	} else {
		0
	}
}
547
// compare_f64 parses both strings as f64 and compares them three-way:
// -1 when left is smaller, 1 when left is larger, 0 otherwise.
// It fails when either string is not a number.
fn compare_f64(left string, right string) !int {
	a := parse_f64(left, 'left value')!
	b := parse_f64(right, 'right value')!
	return if a < b {
		-1
	} else if a > b {
		1
	} else {
		0
	}
}
559
// parse_f64 converts `value` to f64 after removing surrounding whitespace.
// On failure the error reads `<label> is not a number: `<value>``, quoting
// the original, untrimmed value.
// The JS backend calls `strconv.atof64` with one argument; other backends
// pass an explicit default `strconv.AtoF64Param`.
fn parse_f64(value string, label string) !f64 {
	cleaned := value.trim_space()
	$if js {
		number := strconv.atof64(cleaned) or {
			return error('${label} is not a number: `${value}`')
		}
		return number
	} $else {
		number := strconv.atof64(cleaned, strconv.AtoF64Param{}) or {
			return error('${label} is not a number: `${value}`')
		}
		return number
	}
}
570
// sum_f64 adds the values from first to last and returns the total;
// an empty array gives 0.
fn sum_f64(values []f64) f64 {
	mut accumulator := 0.0
	for i in 0 .. values.len {
		accumulator += values[i]
	}
	return accumulator
}
578
// median_f64 returns the median of values that are already sorted: the
// middle element for an odd count, the average of the two middle elements
// for an even count. The array must not be empty; every caller in this file
// checks that before calling.
fn median_f64(sorted_values []f64) f64 {
	half := sorted_values.len / 2
	is_odd := sorted_values.len % 2 != 0
	if is_odd {
		return sorted_values[half]
	}
	lower := sorted_values[half - 1]
	upper := sorted_values[half]
	return (lower + upper) / 2.0
}
586