| 1 | module dataframe |
| 2 | |
| 3 | import encoding.csv |
| 4 | import math |
| 5 | import strconv |
| 6 | |
// CsvConfig holds the options accepted by `from_csv` and `read_csv`.
// Every field except `has_header` is forwarded, under the same name, to
// `csv.csv_reader`; `has_header` and `empty_cell` are also consumed by this
// module when the rows are turned into a DataFrame.
@[params]
pub struct CsvConfig {
pub:
	// When true the first CSV row supplies the column names and data starts
	// at the second row; when false the columns are named `column_0`,
	// `column_1`, ... and the first row is treated as data.
	has_header bool = true
	// Field separator byte handed to the CSV reader.
	separator u8 = `,`
	// Comment marker byte handed to the CSV reader.
	comment u8 = `#`
	// `default_cell` value handed to the CSV reader.
	default_cell string
	// `empty_cell` value handed to the CSV reader; also used by this module
	// to pad rows that have fewer cells than there are columns.
	empty_cell string
	// `end_line_len` value handed to the CSV reader.
	end_line_len int = csv.endline_cr_len
	// Quote byte handed to the CSV reader.
	quote u8 = `"`
	// `quote_remove` flag handed to the CSV reader.
	quote_remove bool
}
| 20 | |
// SortOrder selects the direction applied by `sort_by` and `sort_by_f64`.
pub enum SortOrder {
	// Smaller keys come first.
	asc
	// Larger keys come first.
	desc
}
| 26 | |
// Summary bundles the numeric statistics produced by `Series.describe`.
pub struct Summary {
pub:
	// Number of values that were summarised.
	count int
	// Sum of all values.
	sum f64
	// Arithmetic mean (`sum / count`).
	mean f64
	// Smallest value.
	min f64
	// Largest value.
	max f64
	// Middle value of the sorted data (mean of the two middle values when
	// `count` is even).
	median f64
	// Population standard deviation (the variance is divided by `count`).
	stddev f64
}
| 38 | |
// Series is one named column whose cells are kept as strings; the numeric
// helpers parse the cells on demand.
pub struct Series {
pub:
	// Column name.
	name string
	// Cell values, one per row, in row order.
	values []string
}
| 45 | |
// Row exposes the cells of a single DataFrame row keyed by column name.
pub struct Row {
pub:
	// Cell value for each column name.
	values map[string]string
}
| 51 | |
// DataFrame is a rectangular table of string cells addressed by row position
// and column name.
pub struct DataFrame {
	// Column name -> position inside each row; private to the module.
	index map[string]int
pub:
	// Column names in positional order.
	columns []string
	// Row data; every row holds one cell per column.
	rows [][]string
}
| 59 | |
// new creates a DataFrame from column names and rows.
//
// Column names are trimmed of surrounding whitespace before they are stored,
// so the public `columns` list uses exactly the same spelling as the keys of
// the lookup index (which `build_index` trims too). Without that, a name such
// as ` a ` would be exposed by `row()` yet be unresolvable through
// `column()`, `cell()` or `select()`.
//
// Returns an error when `build_index` rejects the names or when a row does
// not contain exactly one cell per column. The stored names and rows are
// copies, so later changes to the caller's arrays do not reach the DataFrame.
pub fn new(columns []string, rows [][]string) !DataFrame {
	// `map` allocates a fresh array, so this is also the defensive copy.
	names := columns.map(it.trim_space())
	index := build_index(names)!
	mut copied_rows := [][]string{cap: rows.len}
	for row_index, row in rows {
		if row.len != names.len {
			return error('row ${row_index} has ${row.len} values, expected ${names.len}')
		}
		copied_rows << row.clone()
	}
	return DataFrame{
		index: index
		columns: names
		rows: copied_rows
	}
}
| 76 | |
// empty builds a DataFrame that has the given columns and no rows.
// Any error reported by `new` for the column names is passed on.
pub fn empty(columns []string) !DataFrame {
	no_rows := [][]string{}
	frame := new(columns, no_rows)!
	return frame
}
| 81 | |
// from_series assembles a DataFrame whose columns are the given series, in
// the order supplied. Fails when no series is given, when a series differs in
// length from the first one, or when `new` rejects the resulting table.
pub fn from_series(series []Series) !DataFrame {
	if series.len == 0 {
		return error('at least one series is required')
	}
	expected_len := series[0].values.len
	// Reject ragged input before any row is assembled.
	for entry in series {
		if entry.values.len != expected_len {
			return error('series `${entry.name}` has ${entry.values.len} values, expected ${expected_len}')
		}
	}
	names := series.map(it.name)
	// Transpose: row r takes the r-th value of every series.
	mut table := [][]string{cap: expected_len}
	for r in 0 .. expected_len {
		table << series.map(it.values[r])
	}
	return new(names, table)!
}
| 105 | |
// from_columns builds a DataFrame from a name -> values map.
// Columns are placed in sorted name order. Fails on an empty map or when
// `from_series` rejects the columns.
pub fn from_columns(columns map[string][]string) !DataFrame {
	if columns.len == 0 {
		return error('at least one column is required')
	}
	mut ordered_names := columns.keys()
	ordered_names.sort()
	mut collected := []Series{cap: ordered_names.len}
	for column_name in ordered_names {
		cells := columns[column_name].clone()
		collected << Series{
			name: column_name
			values: cells
		}
	}
	frame := from_series(collected)!
	return frame
}
| 122 | |
// from_csv parses CSV text held in memory into a DataFrame.
// Empty text is rejected up front; otherwise a `csv.csv_reader` is created
// over the string's buffer with the options from `cfg`, and is disposed when
// the function returns.
pub fn from_csv(text string, cfg CsvConfig) !DataFrame {
	if text == '' {
		return error('csv input is empty')
	}
	mut csv_source := csv.csv_reader(
		scr_buf:      text.str
		scr_buf_len:  text.len
		separator:    cfg.separator
		comment:      cfg.comment
		quote:        cfg.quote
		quote_remove: cfg.quote_remove
		default_cell: cfg.default_cell
		empty_cell:   cfg.empty_cell
		end_line_len: cfg.end_line_len
	)!
	defer {
		csv_source.dispose_csv_reader()
	}
	frame := from_csv_reader(mut csv_source, cfg)!
	return frame
}
| 144 | |
// read_csv loads the CSV file at `path` into a DataFrame.
// A `csv.csv_reader` is opened on the file with the options from `cfg` and is
// disposed when the function returns.
pub fn read_csv(path string, cfg CsvConfig) !DataFrame {
	mut csv_source := csv.csv_reader(
		file_path:    path
		separator:    cfg.separator
		comment:      cfg.comment
		quote:        cfg.quote
		quote_remove: cfg.quote_remove
		default_cell: cfg.default_cell
		empty_cell:   cfg.empty_cell
		end_line_len: cfg.end_line_len
	)!
	defer {
		csv_source.dispose_csv_reader()
	}
	frame := from_csv_reader(mut csv_source, cfg)!
	return frame
}
| 162 | |
// from_csv_reader turns the rows of an already opened CSV reader into a
// DataFrame. The first row decides the column count; with `cfg.has_header` it
// also provides the (trimmed) column names, otherwise generated names are
// used and the first row is kept as data. Every data row is cut or padded
// with `cfg.empty_cell` to the column count.
fn from_csv_reader(mut reader csv.RandomAccessReader, cfg CsvConfig) !DataFrame {
	total_rows := reader.csv_map.len
	if total_rows == 0 {
		return error('csv input has no rows')
	}
	leading_cells := reader.get_row(0)!
	if leading_cells.len == 0 {
		return error('csv input has no columns')
	}
	mut columns := []string{}
	mut first_data_row := 0
	if cfg.has_header {
		columns = normalize_columns(leading_cells)
		first_data_row = 1
	} else {
		columns = default_columns(leading_cells.len)
	}
	mut rows := [][]string{cap: math.max(0, total_rows - first_data_row)}
	for position in first_data_row .. total_rows {
		cells := reader.get_row(position)!
		rows << normalize_row(cells, columns.len, cfg.empty_cell)
	}
	return new(columns, rows)!
}
| 184 | |
// height reports how many rows the DataFrame holds.
pub fn (df DataFrame) height() int {
	row_count := df.rows.len
	return row_count
}
| 189 | |
// width reports how many columns the DataFrame holds.
pub fn (df DataFrame) width() int {
	column_count := df.columns.len
	return column_count
}
| 194 | |
// shape reports the dimensions as (row count, column count).
pub fn (df DataFrame) shape() (int, int) {
	row_count := df.rows.len
	column_count := df.columns.len
	return row_count, column_count
}
| 199 | |
// cell fetches the value stored at `row_index` in the column called `column`.
// Fails when the row index is outside the table or the column is unknown; the
// row index is checked first.
pub fn (df DataFrame) cell(row_index int, column string) !string {
	in_range := row_index >= 0 && row_index < df.rows.len
	if !in_range {
		return error('row index ${row_index} is out of range')
	}
	position := df.column_index(column)!
	record := df.rows[row_index]
	return record[position]
}
| 208 | |
// row builds a Row for the record at `row_index`, mapping every column name
// to that record's cell. Fails when the index is outside the table.
pub fn (df DataFrame) row(row_index int) !Row {
	if row_index < 0 || row_index >= df.rows.len {
		return error('row index ${row_index} is out of range')
	}
	cells := df.rows[row_index]
	mut named := map[string]string{}
	for position in 0 .. df.columns.len {
		named[df.columns[position]] = cells[position]
	}
	return Row{
		values: named
	}
}
| 223 | |
// column extracts the named column as a Series holding one value per row.
// The Series carries `name` exactly as it was passed in. Fails when the
// column is unknown.
pub fn (df DataFrame) column(name string) !Series {
	position := df.column_index(name)!
	return Series{
		name: name
		values: df.rows.map(it[position])
	}
}
| 236 | |
// select projects the DataFrame onto `names`, keeping the requested order.
// Fails when a name is unknown or when `new` rejects the projected table.
pub fn (df DataFrame) select(names []string) !DataFrame {
	// Resolve every name first so an unknown column fails before any copying.
	mut positions := []int{cap: names.len}
	for wanted in names {
		positions << df.column_index(wanted)!
	}
	mut projected := [][]string{cap: df.rows.len}
	for source_row in df.rows {
		projected << positions.map(source_row[it])
	}
	return new(names, projected)!
}
| 253 | |
// head returns a DataFrame made of the first `n` rows.
// A non-positive `n` yields no rows; an `n` beyond the height yields all rows.
pub fn (df DataFrame) head(n int) DataFrame {
	mut take := 0
	if n > 0 {
		take = if n < df.rows.len { n } else { df.rows.len }
	}
	return df.with_rows(df.rows[..take])
}
| 262 | |
// tail returns a DataFrame made of the last `n` rows.
// A non-positive `n` yields no rows; an `n` beyond the height yields all rows.
pub fn (df DataFrame) tail(n int) DataFrame {
	total := df.rows.len
	// Starting at `total` selects nothing, which covers n <= 0.
	mut first := total
	if n > 0 {
		first = if n < total { total - n } else { 0 }
	}
	return df.with_rows(df.rows[first..])
}
| 271 | |
// filter keeps the rows for which `predicate` returns true, in their original
// order. The predicate receives each row as a Row.
pub fn (df DataFrame) filter(predicate fn (Row) bool) DataFrame {
	mut kept := [][]string{cap: df.rows.len}
	for position, cells in df.rows {
		// A row whose Row view cannot be built is skipped.
		view := df.row(position) or { continue }
		if !predicate(view) {
			continue
		}
		kept << cells.clone()
	}
	return df.with_rows(kept)
}
| 283 | |
// sort_by returns a copy whose rows are ordered by comparing the named
// column's cells as strings. Fails when the column is unknown.
pub fn (df DataFrame) sort_by(name string, order SortOrder) !DataFrame {
	position := df.column_index(name)!
	ordered := df.sorted(position, order, false)!
	return ordered
}
| 289 | |
// sort_by_f64 returns a copy whose rows are ordered by comparing the named
// column's cells as f64 numbers. Fails when the column is unknown or when
// `sorted` reports a cell that is not a number.
pub fn (df DataFrame) sort_by_f64(name string, order SortOrder) !DataFrame {
	position := df.column_index(name)!
	ordered := df.sorted(position, order, true)!
	return ordered
}
| 295 | |
// value_counts tallies how often each distinct value occurs in the named
// column. Fails when the column is unknown.
pub fn (df DataFrame) value_counts(name string) !map[string]int {
	position := df.column_index(name)!
	mut tally := map[string]int{}
	for cells in df.rows {
		key := cells[position]
		// A key that is not present yet reads as 0.
		tally[key] += 1
	}
	return tally
}
| 306 | |
// describe computes the numeric Summary of the named column.
// Fails when the column is unknown or when `Series.describe` fails.
pub fn (df DataFrame) describe(name string) !Summary {
	target := df.column(name)!
	summary := target.describe()!
	return summary
}
| 312 | |
// sorted returns a copy of the DataFrame whose rows are ordered by the cell
// found at `column_index`.
//
// With `numeric` set, every cell of that column is parsed as f64 exactly once
// before any comparison is made. A cell that is not a number is therefore
// always reported, even when the frame has fewer than two rows, and the error
// names the offending row. Without `numeric`, cells are compared as strings
// through `should_swap`.
//
// The rows are ordered with an insertion sort that only swaps neighbours that
// are strictly out of order, so rows with equal keys keep their relative order.
fn (df DataFrame) sorted(column_index int, order SortOrder, numeric bool) !DataFrame {
	mut rows := clone_rows(df.rows)
	// Numeric keys, kept parallel to `rows`; left empty for string sorting.
	mut keys := []f64{}
	if numeric {
		keys = []f64{cap: rows.len}
		for row_index, row in rows {
			keys << parse_f64(row[column_index], 'row ${row_index}')!
		}
	}
	for i in 1 .. rows.len {
		for j := i; j > 0; j-- {
			mut out_of_order := false
			if numeric {
				out_of_order = match order {
					.asc { keys[j - 1] > keys[j] }
					.desc { keys[j - 1] < keys[j] }
				}
			} else {
				out_of_order = should_swap(rows[j - 1][column_index], rows[j][column_index],
					order, false)!
			}
			if !out_of_order {
				break
			}
			rows[j - 1], rows[j] = rows[j], rows[j - 1]
			if numeric {
				// Keep each key next to the row it was parsed from.
				keys[j - 1], keys[j] = keys[j], keys[j - 1]
			}
		}
	}
	return df.with_rows(rows)
}
| 327 | |
// with_rows builds a DataFrame that shares this frame's schema (copied column
// names and index) but holds copies of the given rows instead.
fn (df DataFrame) with_rows(rows [][]string) DataFrame {
	row_copies := clone_rows(rows)
	return DataFrame{
		rows: row_copies
		columns: df.columns.clone()
		index: df.index.clone()
	}
}
| 335 | |
// column_index resolves a column name to its position inside each row.
//
// The name is trimmed of surrounding whitespace before the lookup because
// `build_index` stores its keys trimmed; this way any spelling that was
// accepted when the DataFrame was built also resolves here. Returns an error
// that quotes the name as given when no such column exists.
fn (df DataFrame) column_index(name string) !int {
	key := name.trim_space()
	// Single map access: the `or` block runs only when the key is absent.
	return df.index[key] or { return error('unknown column `${name}`') }
}
| 342 | |
// get looks up the cell stored under the column `name` in this Row.
// Fails when the Row has no such column.
pub fn (row Row) get(name string) !string {
	return row.values[name] or { return error('unknown column `${name}`') }
}
| 350 | |
// len reports how many values the Series holds.
pub fn (s Series) len() int {
	value_count := s.values.len
	return value_count
}
| 355 | |
// get fetches the value at position `index` of the Series.
// Fails when `index` is negative or not smaller than the Series length.
pub fn (s Series) get(index int) !string {
	if index >= 0 && index < s.values.len {
		return s.values[index]
	}
	return error('series index ${index} is out of range')
}
| 363 | |
// f64s parses every value of the Series as f64, preserving order.
// Fails on the first value that is not a number; the error label has the form
// `name[index]`.
pub fn (s Series) f64s() ![]f64 {
	mut parsed := []f64{cap: s.values.len}
	for position in 0 .. s.values.len {
		label := '${s.name}[${position}]'
		parsed << parse_f64(s.values[position], label)!
	}
	return parsed
}
| 372 | |
// sum adds up all values of the Series as f64; an empty Series sums to 0.
// Fails when a value is not a number.
pub fn (s Series) sum() !f64 {
	numbers := s.f64s()!
	return sum_f64(numbers)
}
| 382 | |
// mean computes the arithmetic mean of the Series values.
// Fails when a value is not a number or when the Series is empty.
pub fn (s Series) mean() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	total := sum_f64(numbers)
	return total / f64(numbers.len)
}
| 391 | |
// min finds the smallest numeric value of the Series.
// Fails when a value is not a number or when the Series is empty.
pub fn (s Series) min() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	mut smallest := numbers[0]
	for position in 1 .. numbers.len {
		candidate := numbers[position]
		if candidate < smallest {
			smallest = candidate
		}
	}
	return smallest
}
| 406 | |
// max finds the largest numeric value of the Series.
// Fails when a value is not a number or when the Series is empty.
pub fn (s Series) max() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	mut largest := numbers[0]
	for position in 1 .. numbers.len {
		candidate := numbers[position]
		if candidate > largest {
			largest = candidate
		}
	}
	return largest
}
| 421 | |
// median computes the median of the Series values.
// Fails when a value is not a number or when the Series is empty.
pub fn (s Series) median() !f64 {
	mut numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	// median_f64 expects its input in sorted order.
	numbers.sort()
	middle := median_f64(numbers)
	return middle
}
| 431 | |
// stddev computes the population standard deviation of the Series, i.e. the
// square root of the mean squared distance from the mean.
// Fails when a value is not a number or when the Series is empty.
pub fn (s Series) stddev() !f64 {
	numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	count := f64(numbers.len)
	average := sum_f64(numbers) / count
	mut squared_total := 0.0
	for number in numbers {
		deviation := number - average
		squared_total += deviation * deviation
	}
	return math.sqrt(squared_total / count)
}
| 446 | |
// describe computes count, sum, mean, min, max, median and population
// standard deviation of the Series in one pass over its parsed values.
// Fails when a value is not a number or when the Series is empty.
pub fn (s Series) describe() !Summary {
	mut numbers := s.f64s()!
	if numbers.len == 0 {
		return error('series `${s.name}` is empty')
	}
	// Sorting up front makes min/max the two ends and feeds median_f64.
	numbers.sort()
	total := sum_f64(numbers)
	average := total / f64(numbers.len)
	mut squared_total := 0.0
	for number in numbers {
		deviation := number - average
		squared_total += deviation * deviation
	}
	spread := math.sqrt(squared_total / f64(numbers.len))
	return Summary{
		count: numbers.len
		sum: total
		mean: average
		min: numbers.first()
		max: numbers.last()
		median: median_f64(numbers)
		stddev: spread
	}
}
| 471 | |
// build_index maps each column name, trimmed of surrounding whitespace, to
// its position. Fails when no columns are given, when a name is empty after
// trimming, or when two names are equal after trimming.
fn build_index(columns []string) !map[string]int {
	if columns.len == 0 {
		return error('at least one column is required')
	}
	mut positions := map[string]int{}
	for position in 0 .. columns.len {
		trimmed := columns[position].trim_space()
		if trimmed == '' {
			return error('column ${position} is empty')
		}
		if trimmed in positions {
			return error('duplicate column `${trimmed}`')
		}
		positions[trimmed] = position
	}
	return positions
}
| 489 | |
// clone_rows returns a new outer array in which every row is a clone of the
// corresponding input row.
fn clone_rows(rows [][]string) [][]string {
	mut duplicate := [][]string{cap: rows.len}
	for position in 0 .. rows.len {
		duplicate << rows[position].clone()
	}
	return duplicate
}
| 497 | |
// default_columns generates `count` placeholder names:
// `column_0`, `column_1`, ... in that order.
fn default_columns(count int) []string {
	mut names := []string{cap: count}
	for position := 0; position < count; position++ {
		names << 'column_' + position.str()
	}
	return names
}
| 505 | |
// normalize_columns returns the given names with surrounding whitespace
// removed from each, in the same order.
fn normalize_columns(columns []string) []string {
	return columns.map(it.trim_space())
}
| 513 | |
// normalize_row returns a row of exactly `column_count` cells: cells beyond
// that count are dropped and missing cells are filled with `fill`.
fn normalize_row(row []string, column_count int, fill string) []string {
	mut padded := []string{cap: column_count}
	for position := 0; position < column_count; position++ {
		cell := if position < row.len { row[position] } else { fill }
		padded << cell
	}
	return padded
}
| 525 | |
// should_swap reports whether `left` and `right`, which sit next to each
// other in that order, violate `order` and must trade places. The values are
// compared as f64 when `numeric` is set (which can fail) and as strings
// otherwise. Equal values never need a swap.
fn should_swap(left string, right string, order SortOrder, numeric bool) !bool {
	mut relation := 0
	if numeric {
		relation = compare_f64(left, right)!
	} else {
		relation = compare_string(left, right)
	}
	if order == .asc {
		// Ascending: a larger value in front is out of place.
		return relation > 0
	}
	// Descending: a smaller value in front is out of place.
	return relation < 0
}
| 537 | |
// compare_string orders two strings with the `<` and `>` operators and
// returns -1 when `left` sorts first, 1 when `right` sorts first, else 0.
fn compare_string(left string, right string) int {
	return if left < right {
		-1
	} else if left > right {
		1
	} else {
		0
	}
}
| 547 | |
// compare_f64 parses both strings as f64 and returns -1 when `left` is
// smaller, 1 when it is larger, else 0. Fails when either string is not a
// number; `left` is parsed first.
fn compare_f64(left string, right string) !int {
	lhs := parse_f64(left, 'left value')!
	rhs := parse_f64(right, 'right value')!
	return if lhs < rhs {
		-1
	} else if lhs > rhs {
		1
	} else {
		0
	}
}
| 559 | |
// parse_f64 converts `value`, ignoring surrounding whitespace, to f64 with
// `strconv.atof64`. On failure the error reads
// "<label> is not a number: `<value>`" and quotes the untrimmed value.
// The JS backend calls `atof64` without the parameter struct.
fn parse_f64(value string, label string) !f64 {
	candidate := value.trim_space()
	$if js {
		return strconv.atof64(candidate) or {
			return error('${label} is not a number: `${value}`')
		}
	} $else {
		return strconv.atof64(candidate, strconv.AtoF64Param{}) or {
			return error('${label} is not a number: `${value}`')
		}
	}
}
| 570 | |
// sum_f64 adds the values from first to last, starting at 0.0, and returns
// the total; an empty slice gives 0.0.
fn sum_f64(values []f64) f64 {
	mut running := 0.0
	for position in 0 .. values.len {
		running += values[position]
	}
	return running
}
| 578 | |
// median_f64 returns the median of values that are already sorted in
// ascending order: the middle element for an odd count, the mean of the two
// middle elements for an even count.
//
// An empty slice yields NaN. Without that guard the even-count branch would
// read index -1 and abort the program.
fn median_f64(sorted_values []f64) f64 {
	if sorted_values.len == 0 {
		return math.nan()
	}
	mid := sorted_values.len / 2
	if sorted_values.len % 2 == 1 {
		return sorted_values[mid]
	}
	return (sorted_values[mid - 1] + sorted_values[mid]) / 2.0
}
| 586 | |