From e99c0f40f94e1c6057a8b058a43a7bf63f786b5f Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Thu, 23 Apr 2026 17:42:33 +0300 Subject: [PATCH] x: fix It is recommended that vlang provide a data analysis tool library similar to Python Pandas (fixes #19355) --- vlib/x/dataframe/README.md | 42 +++ vlib/x/dataframe/dataframe.v | 585 ++++++++++++++++++++++++++++++ vlib/x/dataframe/dataframe_test.v | 67 ++++ 3 files changed, 694 insertions(+) create mode 100644 vlib/x/dataframe/README.md create mode 100644 vlib/x/dataframe/dataframe.v create mode 100644 vlib/x/dataframe/dataframe_test.v diff --git a/vlib/x/dataframe/README.md b/vlib/x/dataframe/README.md new file mode 100644 index 000000000..51136a8d2 --- /dev/null +++ b/vlib/x/dataframe/README.md @@ -0,0 +1,42 @@ +# DataFrame + +`x.dataframe` provides a small experimental tabular data API. It is intended as +a foundation for data analysis workflows that need DataFrame-style operations +without leaving V. + +The module stores cells as strings and provides numeric helpers on `Series` for +common analysis tasks. + +```v +import x.dataframe + +const prices = 'symbol,price,qty +AAPL,189.5,2 +MSFT,420.25,3 +' + +fn main() { + df := dataframe.from_csv(prices, dataframe.CsvConfig{})! + println(df.shape()) + + price := df.column('price')! + println(price.mean()!) + + liquid := df.filter(fn (row dataframe.Row) bool { + return row.values['qty'].int() >= 3 + }) + println(liquid.rows) +} +``` + +## Features + +- Load tabular data from CSV strings or files. +- Access cells, rows, and columns by name. +- Select columns and filter rows. +- Sort by string or numeric column values. +- Count distinct column values. +- Calculate `sum`, `mean`, `min`, `max`, `median`, `stddev`, and `describe` + summaries for numeric columns. + +The API is experimental and may change while the module is under `x`. 
diff --git a/vlib/x/dataframe/dataframe.v b/vlib/x/dataframe/dataframe.v
new file mode 100644
index 000000000..02c7123ce
--- /dev/null
+++ b/vlib/x/dataframe/dataframe.v
@@ -0,0 +1,585 @@
+module dataframe
+
+import encoding.csv
+import math
+import strconv
+
+// CsvConfig configures CSV loading for a DataFrame.
+// Every field except `has_header` is forwarded unchanged to the `encoding.csv`
+// random access reader; `empty_cell` is additionally used to pad rows that
+// have fewer cells than there are columns.
+@[params]
+pub struct CsvConfig {
+pub:
+	// has_header selects where column names come from: the first CSV row when
+	// true, generated `column_0`, `column_1`, ... names when false.
+	has_header   bool = true
+	separator    u8   = `,`
+	comment      u8   = `#`
+	default_cell string
+	empty_cell   string
+	end_line_len int  = csv.endline_cr_len
+	quote        u8   = `"`
+	quote_remove bool
+}
+
+// SortOrder controls the direction used by sorting helpers.
+pub enum SortOrder {
+	asc
+	desc
+}
+
+// Summary contains basic numeric statistics for a Series.
+// `stddev` is the population standard deviation (the divisor is `count`).
+pub struct Summary {
+pub:
+	count  int
+	sum    f64
+	mean   f64
+	min    f64
+	max    f64
+	median f64
+	stddev f64
+}
+
+// Series is a single named string column with numeric helpers.
+pub struct Series {
+pub:
+	name   string
+	values []string
+}
+
+// Row is a named view of a DataFrame row: column name -> cell value.
+pub struct Row {
+pub:
+	values map[string]string
+}
+
+// DataFrame stores rectangular tabular data as string cells.
+// Every row has exactly `columns.len` cells.
+pub struct DataFrame {
+	// index maps a (whitespace-trimmed) column name to its position in `columns`.
+	index map[string]int
+pub:
+	columns []string
+	rows    [][]string
+}
+
+// new creates a DataFrame from column names and rows.
+// Column names are trimmed of surrounding whitespace before they are stored,
+// so `df.columns` always holds exactly the names that lookups accept.
+// The input arrays are copied; later changes to them do not affect the result.
+// It returns an error when there are no columns, when a column name is empty
+// after trimming, when two columns share a name, or when a row does not have
+// one value per column.
+pub fn new(columns []string, rows [][]string) !DataFrame {
+	index := build_index(columns)!
+	mut copied_rows := [][]string{cap: rows.len}
+	for row_index, row in rows {
+		if row.len != columns.len {
+			return error('row ${row_index} has ${row.len} values, expected ${columns.len}')
+		}
+		copied_rows << row.clone()
+	}
+	return DataFrame{
+		index: index
+		// `build_index` keys the lookup table by the trimmed name, so the stored
+		// names must be trimmed too; otherwise `select(df.columns)` or
+		// `column(df.columns[0])` would fail for a name such as ' price '.
+		columns: normalize_columns(columns)
+		rows:    copied_rows
+	}
+}
+
+// empty creates an empty DataFrame with the given columns.
+// It fails under the same column name rules as `new`.
+pub fn empty(columns []string) !DataFrame {
+	return new(columns, [][]string{})!
+}
+
+// from_series creates a DataFrame from named columns.
+// All series must have the same number of values.
+pub fn from_series(series []Series) !DataFrame {
+	if series.len == 0 {
+		return error('at least one series is required')
+	}
+	// The first series fixes the expected row count for all the others.
+	row_count := series[0].values.len
+	mut columns := []string{cap: series.len}
+	for s in series {
+		if s.values.len != row_count {
+			return error('series `${s.name}` has ${s.values.len} values, expected ${row_count}')
+		}
+		columns << s.name
+	}
+	// Transpose the column-major input into row-major storage.
+	mut rows := [][]string{cap: row_count}
+	for row_index in 0 .. row_count {
+		mut row := []string{cap: series.len}
+		for s in series {
+			row << s.values[row_index]
+		}
+		rows << row
+	}
+	return new(columns, rows)!
+}
+
+// from_columns creates a DataFrame from a map of columns.
+// Columns are ordered by sorted name, so the layout of the result does not
+// depend on the iteration order of the map.
+pub fn from_columns(columns map[string][]string) !DataFrame {
+	if columns.len == 0 {
+		return error('at least one column is required')
+	}
+	mut names := columns.keys()
+	names.sort()
+	mut series := []Series{cap: names.len}
+	for name in names {
+		series << Series{
+			name:   name
+			values: columns[name].clone()
+		}
+	}
+	return from_series(series)!
+}
+
+// from_csv creates a DataFrame from CSV text.
+// It returns an error for empty input and for any failure reported by the CSV
+// reader or by `new`.
+pub fn from_csv(text string, cfg CsvConfig) !DataFrame {
+	if text.len == 0 {
+		return error('csv input is empty')
+	}
+	mut reader := csv.csv_reader(
+		scr_buf:      text.str
+		scr_buf_len:  text.len
+		separator:    cfg.separator
+		comment:      cfg.comment
+		default_cell: cfg.default_cell
+		empty_cell:   cfg.empty_cell
+		end_line_len: cfg.end_line_len
+		quote:        cfg.quote
+		quote_remove: cfg.quote_remove
+	)!
+	// Release the reader on every exit path, including error returns.
+	defer {
+		reader.dispose_csv_reader()
+	}
+	return from_csv_reader(mut reader, cfg)!
+}
+
+// read_csv creates a DataFrame from a CSV file.
+pub fn read_csv(path string, cfg CsvConfig) !DataFrame {
+	mut reader := csv.csv_reader(
+		file_path:    path
+		separator:    cfg.separator
+		comment:      cfg.comment
+		default_cell: cfg.default_cell
+		empty_cell:   cfg.empty_cell
+		end_line_len: cfg.end_line_len
+		quote:        cfg.quote
+		quote_remove: cfg.quote_remove
+	)!
+	// Release the reader on every exit path, including error returns.
+	defer {
+		reader.dispose_csv_reader()
+	}
+	return from_csv_reader(mut reader, cfg)!
+}
+
+// from_csv_reader builds a DataFrame from an already opened CSV reader.
+// The first row fixes the column count. Later rows with fewer cells are padded
+// with `cfg.empty_cell`; cells beyond the column count are dropped.
+fn from_csv_reader(mut reader csv.RandomAccessReader, cfg CsvConfig) !DataFrame {
+	if reader.csv_map.len == 0 {
+		return error('csv input has no rows')
+	}
+	first_row := reader.get_row(0)!
+	if first_row.len == 0 {
+		return error('csv input has no columns')
+	}
+	columns := if cfg.has_header {
+		normalize_columns(first_row)
+	} else {
+		default_columns(first_row.len)
+	}
+	// With a header, data starts on the second row.
+	start_row := if cfg.has_header { 1 } else { 0 }
+	mut rows := [][]string{cap: math.max(0, reader.csv_map.len - start_row)}
+	for row_index in start_row .. reader.csv_map.len {
+		row := reader.get_row(row_index)!
+		rows << normalize_row(row, columns.len, cfg.empty_cell)
+	}
+	return new(columns, rows)!
+}
+
+// height returns the number of rows.
+pub fn (df DataFrame) height() int {
+	return df.rows.len
+}
+
+// width returns the number of columns.
+pub fn (df DataFrame) width() int {
+	return df.columns.len
+}
+
+// shape returns the row and column count, in that order.
+pub fn (df DataFrame) shape() (int, int) {
+	return df.rows.len, df.columns.len
+}
+
+// cell returns a single cell by row index and column name.
+// It returns an error for an out of range row or an unknown column.
+pub fn (df DataFrame) cell(row_index int, column string) !string {
+	if row_index < 0 || row_index >= df.rows.len {
+		return error('row index ${row_index} is out of range')
+	}
+	column_index := df.column_index(column)!
+	return df.rows[row_index][column_index]
+}
+
+// row returns a named row by index.
+// It returns an error when `row_index` is out of range.
+pub fn (df DataFrame) row(row_index int) !Row {
+	if row_index < 0 || row_index >= df.rows.len {
+		return error('row index ${row_index} is out of range')
+	}
+	mut values := map[string]string{}
+	row := df.rows[row_index]
+	for column_index, column in df.columns {
+		values[column] = row[column_index]
+	}
+	return Row{
+		values: values
+	}
+}
+
+// column returns a Series by column name.
+// The Series holds a copy of the column values in row order.
+pub fn (df DataFrame) column(name string) !Series {
+	column_index := df.column_index(name)!
+	mut values := []string{cap: df.rows.len}
+	for row in df.rows {
+		values << row[column_index]
+	}
+	return Series{
+		name:   name
+		values: values
+	}
+}
+
+// select returns a DataFrame with only the requested columns, in the
+// requested order. It returns an error for an unknown column, and for an empty
+// or duplicated selection (both rejected by `new`).
+pub fn (df DataFrame) select(names []string) !DataFrame {
+	// Resolve every name first so that an unknown column fails before any row
+	// is copied.
+	mut column_indices := []int{cap: names.len}
+	for name in names {
+		column_indices << df.column_index(name)!
+	}
+	mut rows := [][]string{cap: df.rows.len}
+	for row in df.rows {
+		mut selected := []string{cap: names.len}
+		for column_index in column_indices {
+			selected << row[column_index]
+		}
+		rows << selected
+	}
+	return new(names, rows)!
+}
+
+// head returns the first n rows.
+// A non-positive `n` yields an empty DataFrame; an `n` larger than the height
+// yields every row.
+pub fn (df DataFrame) head(n int) DataFrame {
+	if n <= 0 {
+		return df.with_rows([][]string{})
+	}
+	end := math.min(n, df.rows.len)
+	return df.with_rows(df.rows[..end])
+}
+
+// tail returns the last n rows.
+// A non-positive `n` yields an empty DataFrame; an `n` larger than the height
+// yields every row.
+pub fn (df DataFrame) tail(n int) DataFrame {
+	if n <= 0 {
+		return df.with_rows([][]string{})
+	}
+	start := math.max(0, df.rows.len - n)
+	return df.with_rows(df.rows[start..])
+}
+
+// filter returns rows accepted by the predicate.
+// The predicate is called once per row, in row order, with a named view of the
+// row; accepted rows keep their relative order in the result.
+pub fn (df DataFrame) filter(predicate fn (Row) bool) DataFrame {
+	mut rows := [][]string{cap: df.rows.len}
+	for cells in df.rows {
+		// Build the named view inline. Going through `df.row()` only added a
+		// bounds check that cannot fail here, plus an `or` branch that would
+		// have dropped the row silently.
+		mut values := map[string]string{}
+		for column_index, column in df.columns {
+			values[column] = cells[column_index]
+		}
+		named := Row{
+			values: values
+		}
+		if predicate(named) {
+			rows << cells.clone()
+		}
+	}
+	// The accepted rows are already private copies, so the result is assembled
+	// directly instead of paying for a second deep copy in `with_rows`.
+	return DataFrame{
+		index:   df.index.clone()
+		columns: df.columns.clone()
+		rows:    rows
+	}
+}
+
+// sort_by returns rows sorted lexicographically by column.
+pub fn (df DataFrame) sort_by(name string, order SortOrder) !DataFrame {
+	column_index := df.column_index(name)!
+	return df.sorted(column_index, order, false)!
+}
+
+// sort_by_f64 returns rows sorted numerically by column.
+// It returns an error when a compared cell cannot be parsed as a number.
+pub fn (df DataFrame) sort_by_f64(name string, order SortOrder) !DataFrame {
+	column_index := df.column_index(name)!
+	return df.sorted(column_index, order, true)!
+}
+
+// value_counts counts unique values in a column.
+pub fn (df DataFrame) value_counts(name string) !map[string]int {
+	column_index := df.column_index(name)!
+	mut counts := map[string]int{}
+	for row in df.rows {
+		value := row[column_index]
+		counts[value]++
+	}
+	return counts
+}
+
+// describe returns numeric statistics for a named column.
+// It returns an error for an unknown column, an empty column, or a cell that
+// is not a number.
+pub fn (df DataFrame) describe(name string) !Summary {
+	series := df.column(name)!
+	return series.describe()!
+}
+
+// sorted returns a copy of `df` with its rows ordered by the cells at
+// `column_index`, compared as numbers when `numeric` is true and as strings
+// otherwise. The sort is a stable insertion sort: rows with equal keys keep
+// their original relative order, and the cost is quadratic in the row count.
+fn (df DataFrame) sorted(column_index int, order SortOrder, numeric bool) !DataFrame {
+	mut rows := clone_rows(df.rows)
+	// Parse every numeric key exactly once, up front. Parsing inside the
+	// comparison would repeat the string -> f64 conversion for every pair that
+	// is compared, and could only report a failure as "left" or "right" value.
+	// Frames with fewer than two rows need no comparison and are not parsed.
+	mut keys := []f64{}
+	if numeric && rows.len > 1 {
+		keys = []f64{cap: rows.len}
+		column := df.columns[column_index]
+		for row_index, row in rows {
+			keys << parse_f64(row[column_index], '${column}[${row_index}]')!
+		}
+	}
+	for i in 1 .. rows.len {
+		mut j := i
+		for j > 0 {
+			comparison := if numeric {
+				compare_f64(keys[j - 1], keys[j])
+			} else {
+				compare_string(rows[j - 1][column_index], rows[j][column_index])
+			}
+			if !should_swap(comparison, order) {
+				break
+			}
+			rows[j - 1], rows[j] = rows[j], rows[j - 1]
+			if numeric {
+				// Keep the parsed keys aligned with the rows they belong to.
+				keys[j - 1], keys[j] = keys[j], keys[j - 1]
+			}
+			j--
+		}
+	}
+	// `rows` is already a private copy, so skip the extra copy in `with_rows`.
+	return DataFrame{
+		index:   df.index.clone()
+		columns: df.columns.clone()
+		rows:    rows
+	}
+}
+
+// with_rows returns a DataFrame with the columns of `df` and a deep copy of
+// `rows`. Callers must pass rows that already match the column count.
+fn (df DataFrame) with_rows(rows [][]string) DataFrame {
+	return DataFrame{
+		index:   df.index.clone()
+		columns: df.columns.clone()
+		rows:    clone_rows(rows)
+	}
+}
+
+// column_index returns the position of the named column, or an error when the
+// DataFrame has no such column.
+fn (df DataFrame) column_index(name string) !int {
+	position := df.index[name] or { return error('unknown column `${name}`') }
+	return position
+}
+
+// get returns a value from the Row by column name.
+// It returns an error when the Row has no such column.
+pub fn (row Row) get(name string) !string {
+	value := row.values[name] or { return error('unknown column `${name}`') }
+	return value
+}
+
+// len returns the number of values in the Series.
+pub fn (s Series) len() int {
+	return s.values.len
+}
+
+// get returns a value from the Series by row index.
+// It returns an error when `index` is out of range.
+pub fn (s Series) get(index int) !string {
+	if index < 0 || index >= s.values.len {
+		return error('series index ${index} is out of range')
+	}
+	return s.values[index]
+}
+
+// f64s converts every value in the Series to f64.
+// It returns an error naming the first value that is not a number.
+pub fn (s Series) f64s() ![]f64 {
+	mut values := []f64{cap: s.values.len}
+	for index, raw_value in s.values {
+		values << parse_f64(raw_value, '${s.name}[${index}]')!
+	}
+	return values
+}
+
+// sum returns the numeric sum of the Series.
+// An empty Series sums to 0.
+pub fn (s Series) sum() !f64 {
+	values := s.f64s()!
+	return sum_f64(values)
+}
+
+// mean returns the numeric mean of the Series.
+// It returns an error for an empty Series.
+pub fn (s Series) mean() !f64 {
+	values := s.f64s()!
+	if values.len == 0 {
+		return error('series `${s.name}` is empty')
+	}
+	return sum_f64(values) / f64(values.len)
+}
+
+// min returns the smallest numeric value in the Series.
+// It returns an error for an empty Series.
+pub fn (s Series) min() !f64 {
+	values := s.f64s()!
+	if values.len == 0 {
+		return error('series `${s.name}` is empty')
+	}
+	mut min_value := values[0]
+	for value in values[1..] {
+		if value < min_value {
+			min_value = value
+		}
+	}
+	return min_value
+}
+
+// max returns the largest numeric value in the Series.
+// It returns an error for an empty Series.
+pub fn (s Series) max() !f64 {
+	values := s.f64s()!
+	if values.len == 0 {
+		return error('series `${s.name}` is empty')
+	}
+	mut max_value := values[0]
+	for value in values[1..] {
+		if value > max_value {
+			max_value = value
+		}
+	}
+	return max_value
+}
+
+// median returns the numeric median of the Series.
+// For an even number of values it is the mean of the two middle values.
+// It returns an error for an empty Series.
+pub fn (s Series) median() !f64 {
+	mut values := s.f64s()!
+	if values.len == 0 {
+		return error('series `${s.name}` is empty')
+	}
+	values.sort()
+	return median_f64(values)
+}
+
+// stddev returns the population standard deviation of the Series.
+// It returns an error for an empty Series.
+pub fn (s Series) stddev() !f64 {
+	values := s.f64s()!
+	if values.len == 0 {
+		return error('series `${s.name}` is empty')
+	}
+	mean := sum_f64(values) / f64(values.len)
+	return math.sqrt(variance_f64(values, mean))
+}
+
+// describe returns basic numeric statistics for the Series.
+// It returns an error for an empty Series or a value that is not a number.
+pub fn (s Series) describe() !Summary {
+	mut values := s.f64s()!
+	if values.len == 0 {
+		return error('series `${s.name}` is empty')
+	}
+	// Sorting once gives min, max and the median without further passes.
+	values.sort()
+	sum := sum_f64(values)
+	mean := sum / f64(values.len)
+	return Summary{
+		count:  values.len
+		sum:    sum
+		mean:   mean
+		min:    values.first()
+		max:    values.last()
+		median: median_f64(values)
+		stddev: math.sqrt(variance_f64(values, mean))
+	}
+}
+
+// build_index maps every trimmed column name to its position.
+// It returns an error when there are no columns, when a name is empty after
+// trimming, or when two columns share a trimmed name.
+fn build_index(columns []string) !map[string]int {
+	if columns.len == 0 {
+		return error('at least one column is required')
+	}
+	mut index := map[string]int{}
+	for column_index, column in columns {
+		name := column.trim_space()
+		if name.len == 0 {
+			return error('column ${column_index} is empty')
+		}
+		if name in index {
+			return error('duplicate column `${name}`')
+		}
+		index[name] = column_index
+	}
+	return index
+}
+
+// clone_rows returns a deep copy of `rows`.
+fn clone_rows(rows [][]string) [][]string {
+	mut copied_rows := [][]string{cap: rows.len}
+	for row in rows {
+		copied_rows << row.clone()
+	}
+	return copied_rows
+}
+
+// default_columns generates `column_0` ... `column_{count-1}`.
+fn default_columns(count int) []string {
+	mut columns := []string{cap: count}
+	for index in 0 .. count {
+		columns << 'column_${index}'
+	}
+	return columns
+}
+
+// normalize_columns returns the column names with surrounding whitespace
+// removed.
+fn normalize_columns(columns []string) []string {
+	mut normalized := []string{cap: columns.len}
+	for column in columns {
+		normalized << column.trim_space()
+	}
+	return normalized
+}
+
+// normalize_row returns exactly `column_count` cells: missing cells are
+// replaced by `fill`, cells beyond `column_count` are dropped.
+fn normalize_row(row []string, column_count int, fill string) []string {
+	mut normalized := []string{cap: column_count}
+	for column_index in 0 .. column_count {
+		if column_index < row.len {
+			normalized << row[column_index]
+		} else {
+			normalized << fill
+		}
+	}
+	return normalized
+}
+
+// should_swap reports whether two adjacent rows are out of order, given the
+// three-way `comparison` of the earlier row against the later one. Equal keys
+// are never swapped, which is what keeps the sort stable.
+fn should_swap(comparison int, order SortOrder) bool {
+	return match order {
+		.asc { comparison > 0 }
+		.desc { comparison < 0 }
+	}
+}
+
+// compare_string returns -1, 0 or 1 for left <, == or > right.
+fn compare_string(left string, right string) int {
+	if left < right {
+		return -1
+	}
+	if left > right {
+		return 1
+	}
+	return 0
+}
+
+// compare_f64 returns -1, 0 or 1 for left <, == or > right.
+// It returns 0 whenever neither `<` nor `>` holds.
+fn compare_f64(left f64, right f64) int {
+	if left < right {
+		return -1
+	}
+	if left > right {
+		return 1
+	}
+	return 0
+}
+
+// parse_f64 converts `value` to f64, ignoring surrounding whitespace.
+// `label` identifies the value in the error message.
+fn parse_f64(value string, label string) !f64 {
+	trimmed_value := value.trim_space()
+	$if js {
+		return strconv.atof64(trimmed_value) or { error('${label} is not a number: `${value}`') }
+	} $else {
+		return strconv.atof64(trimmed_value, strconv.AtoF64Param{}) or {
+			error('${label} is not a number: `${value}`')
+		}
+	}
+}
+
+// sum_f64 returns the sum of `values`, 0 for an empty array.
+fn sum_f64(values []f64) f64 {
+	mut total := 0.0
+	for value in values {
+		total += value
+	}
+	return total
+}
+
+// variance_f64 returns the population variance of `values` around `mean`.
+// `values` must not be empty.
+fn variance_f64(values []f64, mean f64) f64 {
+	mut squared_diff_sum := 0.0
+	for value in values {
+		diff := value - mean
+		squared_diff_sum += diff * diff
+	}
+	return squared_diff_sum / f64(values.len)
+}
+
+// median_f64 returns the median of `sorted_values`, which must be sorted in
+// ascending order and must not be empty.
+fn median_f64(sorted_values []f64) f64 {
+	mid := sorted_values.len / 2
+	if sorted_values.len % 2 == 1 {
+		return sorted_values[mid]
+	}
+	return (sorted_values[mid - 1] + sorted_values[mid]) / 2.0
+}
diff --git a/vlib/x/dataframe/dataframe_test.v b/vlib/x/dataframe/dataframe_test.v
new file mode 100644
index 000000000..3da3609d2
--- /dev/null
+++ b/vlib/x/dataframe/dataframe_test.v
@@ -0,0 +1,67 @@
+import math
+import x.dataframe
+
+const prices_csv = 'symbol,price,qty
+AAPL,189.5,2
+MSFT,420.25,3
+AAPL,191.0,5
+'
+
+fn test_from_csv_and_numeric_series() {
+	df := dataframe.from_csv(prices_csv, dataframe.CsvConfig{})!
+	rows, columns := df.shape()
+	assert rows == 3
+	assert columns == 3
+	assert df.columns == ['symbol', 'price', 'qty']
+	assert df.cell(1, 'symbol')! == 'MSFT'
+
+	prices := df.column('price')!
+	assert prices.len() == 3
+	assert math.alike(prices.mean()!, 266.9166666666667)
+	assert math.alike(prices.min()!, 189.5)
+	assert math.alike(prices.max()!, 420.25)
+	assert math.alike(prices.median()!, 191.0)
+
+	summary := df.describe('qty')!
+	assert summary.count == 3
+	assert math.alike(summary.sum, 10.0)
+	assert math.alike(summary.mean, 3.3333333333333335)
+}
+
+fn test_select_filter_sort_and_value_counts() {
+	df := dataframe.from_csv(prices_csv, dataframe.CsvConfig{})!
+	aapl := df.filter(fn (row dataframe.Row) bool {
+		return row.values['symbol'] == 'AAPL'
+	})
+	assert aapl.height() == 2
+
+	selected := aapl.select(['symbol', 'qty'])!
+	assert selected.width() == 2
+	assert selected.cell(1, 'qty')! == '5'
+
+	sorted := df.sort_by_f64('price', .asc)!
+	assert sorted.cell(0, 'symbol')! == 'AAPL'
+	assert sorted.cell(2, 'symbol')! == 'MSFT'
+
+	counts := df.value_counts('symbol')!
+	assert counts['AAPL'] == 2
+	assert counts['MSFT'] == 1
+}
+
+fn test_from_columns_and_csv_without_header() {
+	df := dataframe.from_columns({
+		'b': ['2', '1']
+		'a': ['x', 'y']
+	})!
+	assert df.columns == ['a', 'b']
+	assert df.cell(0, 'a')! == 'x'
+	assert math.alike(df.column('b')!.median()!, 1.5)
+
+	no_header := dataframe.from_csv('AAPL,189.5
+MSFT,420.25
+', dataframe.CsvConfig{
+		has_header: false
+	})!
+	assert no_header.columns == ['column_0', 'column_1']
+	assert no_header.cell(1, 'column_1')! == '420.25'
+}
-- 
2.39.5