From 01c9d247393f6213c95d6ff115157e0dc6c47bd5 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:30:00 -0500 Subject: [PATCH 1/3] feat: an even smarter `pivotp`, featuring auto-aggregation selection based on type/stats --- src/cmd/pivotp.rs | 189 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 169 insertions(+), 20 deletions(-) diff --git a/src/cmd/pivotp.rs b/src/cmd/pivotp.rs index 472fe632d..a4c048fba 100644 --- a/src/cmd/pivotp.rs +++ b/src/cmd/pivotp.rs @@ -5,7 +5,9 @@ The pivot operation consists of: - One or more index columns (these will be the new rows) - A column that will be pivoted (this will create the new columns) - A values column that will be aggregated -- An aggregation function to apply +- An aggregation function to apply. Features "smart" aggregation auto-selection. + +For examples, see https://github.com/dathere/qsv/blob/master/tests/test_pivotp.rs. Usage: qsv pivotp [options] @@ -36,7 +38,12 @@ pivotp options: median - Median value count - Count of values last - Last value encountered - [default: count] + none - No aggregation is done. Raises error if multiple values are in group. + smart - use value column statistics to pick an aggregation. + Will only work if there is one value column, otherwise + it falls back to `first` + smartq - same as smart, but no messages. + [default: smart] --sort-columns Sort the transposed columns by name. Default is by order of discovery. --col-separator The separator in generated column names in case of multiple --values columns. [default: _] @@ -44,7 +51,8 @@ pivotp options: --try-parsedates When set, will attempt to parse columns as dates. --infer-len Number of rows to scan when inferring schema. Set to 0 to scan entire file. [default: 10000] - --decimal-comma Use comma as decimal separator. + --decimal-comma Use comma as decimal separator when READING the input. + Note that you will need to specify an alternate --delimiter. --ignore-errors Skip rows that can't be parsed. Common options: @@ -54,20 +62,24 @@ Common options: Must be a single character. (default: ,) "#; -use std::{fs::File, io, io::Write, path::Path}; +use std::{fs::File, io, io::Write, path::Path, sync::OnceLock}; +use csv::ByteRecord; use indicatif::HumanCount; use polars::prelude::*; use polars_ops::pivot::{pivot_stable, PivotAgg}; use serde::Deserialize; use crate::{ - config::Delimiter, + cmd::stats::StatsData, + config::{Config, Delimiter}, util, util::{get_stats_records, StatsMode}, CliResult, }; +static STATS_RECORDS: OnceLock<(ByteRecord, Vec)> = OnceLock::new(); + #[derive(Deserialize)] struct Args { arg_on_cols: String, @@ -115,11 +127,10 @@ fn calculate_pivot_metadata( flag_memcheck: false, }; - let Ok((csv_fields, csv_stats)) = + let (csv_fields, csv_stats) = STATS_RECORDS.get_or_init(|| { get_stats_records(&schema_args, StatsMode::FrequencyForceStats) - else { - return Ok(None); - }; + .unwrap_or_else(|_| (ByteRecord::new(), Vec::new())) + }); if csv_stats.is_empty() { return Ok(None); @@ -183,6 +194,113 @@ fn validate_pivot_operation(metadata: &PivotMetadata) -> CliResult<()> { Ok(()) } +/// Suggest an appropriate aggregation function based on column statistics +#[allow(clippy::cast_precision_loss)] +fn suggest_agg_function( + args: &Args, + value_cols: &[String], + quiet: bool, +) -> CliResult> { + let schema_args = util::SchemaArgs { + flag_enum_threshold: 0, + flag_ignore_case: false, + flag_strict_dates: false, + flag_pattern_columns: crate::select::SelectColumns::parse("").unwrap(), + flag_dates_whitelist: String::new(), + flag_prefer_dmy: false, + flag_force: false, + flag_stdout: false, + flag_jobs: None, + flag_no_headers: false, + flag_delimiter: args.flag_delimiter, + arg_input: Some(args.arg_input.clone()), + flag_memcheck: false, + }; + + let (csv_fields, csv_stats) = STATS_RECORDS.get_or_init(|| { + get_stats_records(&schema_args, StatsMode::FrequencyForceStats) + .unwrap_or_else(|_| (ByteRecord::new(), Vec::new())) + }); + + // If multiple value columns, default to First + if value_cols.len() > 1 { + return Ok(Some(PivotAgg::First)); + } + + // Get stats for the value column + let value_col = &value_cols[0]; + let field_pos = csv_fields + .iter() + .position(|f| std::str::from_utf8(f).unwrap_or("") == value_col); + + if let Some(pos) = field_pos { + let stats = &csv_stats[pos]; + let rconfig = Config::new(Some(&args.arg_input)); + let row_count = util::count_rows(&rconfig)? as u64; + + // Suggest aggregation based on field type and statistics + let suggested_agg = match stats.r#type.as_str() { + "NULL" => { + if !quiet { + eprintln!("Info: \"{value_col}\" contains only NULL values"); + } + PivotAgg::Count + }, + "Integer" | "Float" => { + if stats.nullcount as f64 / row_count as f64 > 0.5 { + if !quiet { + eprintln!("Info: \"{value_col}\" contains >50% NULL values, using Count"); + } + PivotAgg::Count + } else { + PivotAgg::Sum + } + }, + "Date" | "DateTime" => { + if stats.cardinality as f64 / row_count as f64 > 0.9 { + if !quiet { + eprintln!( + "Info: {} column \"{value_col}\" has high cardinality, using First", + stats.r#type + ); + } + PivotAgg::First + } else { + if !quiet { + eprintln!( + "Info: \"{value_col}\" is a {} column, using Count", + stats.r#type + ); + } + PivotAgg::Count + } + }, + _ => { + if stats.cardinality == row_count { + if !quiet { + eprintln!("Info: \"{value_col}\" contains all unique values, using First"); + } + PivotAgg::First + } else if stats.cardinality as f64 / row_count as f64 > 0.5 { + if !quiet { + eprintln!("Info: \"{value_col}\" has high cardinality, using Count"); + } + PivotAgg::Count + } else { + if !quiet { + eprintln!("Info: \"{value_col}\" is a String column, using Count"); + } + PivotAgg::Count + } + }, + }; + + Ok(Some(suggested_agg)) + } else { + Ok(None) + } +} + pub fn run(argv: &[&str]) -> CliResult<()> { let args: Args = util::get_args(USAGE, argv)?; @@ -226,17 +344,42 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // Get aggregation function let agg_fn = if let Some(ref agg) = args.flag_agg { - Some(match agg.to_lowercase().as_str() { - "first" => PivotAgg::First, - "sum" => PivotAgg::Sum, - "min" => PivotAgg::Min, - "max" => PivotAgg::Max, - "mean" => PivotAgg::Mean, - "median" => PivotAgg::Median, - "count" => PivotAgg::Count, - "last" => PivotAgg::Last, - _ => return fail_clierror!("Invalid pivot aggregation function: {agg}"), - }) + let lower_agg = agg.to_lowercase(); + if lower_agg == "none" { + None + } else { + Some(match lower_agg.as_str() { + "first" => PivotAgg::First, + "sum" => PivotAgg::Sum, + "min" => PivotAgg::Min, + "max" => PivotAgg::Max, + "mean" => PivotAgg::Mean, + "median" => PivotAgg::Median, + "count" => PivotAgg::Count, + "last" => PivotAgg::Last, + "smart" | "smartq" => { + if let Some(value_cols) = &value_cols { + // Try to suggest an appropriate aggregation function + if let Some(suggested_agg) = + suggest_agg_function(&args, value_cols, lower_agg == "smartq")? + { + suggested_agg + } else { + // fallback to first, which always works + PivotAgg::First + } + } else { + // Default to Count if no value columns specified + PivotAgg::Count + } + }, + _ => { + return fail_incorrectusage_clierror!( + "Invalid pivot aggregation function: {agg}" + ) + }, + }) + } } else { None }; @@ -248,6 +391,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> { b',' }; + if args.flag_decimal_comma && delim == b',' { + return fail_incorrectusage_clierror!( + "You need to specify an alternate --delimiter when using --decimal-comma." + ); + } + // Create CSV reader config let csv_reader = LazyCsvReader::new(&args.arg_input) .with_has_header(true) From 9d5e091d3d80aeff9c0738d54a37460b01554e51 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:30:33 -0500 Subject: [PATCH 2/3] tests: update `pivotp` tests to account for smart aggregation selection --- tests/test_pivotp.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_pivotp.rs b/tests/test_pivotp.rs index c6eefa344..af9bff7b6 100644 --- a/tests/test_pivotp.rs +++ b/tests/test_pivotp.rs @@ -352,7 +352,7 @@ pivotp_test!( wrk.assert_success(&mut cmd); let got: Vec> = wrk.read_stdout(&mut cmd); - let expected = vec![svec!["date;A;B"], svec!["2023-01-01;1;1"]]; + let expected = vec![svec!["date;A;B"], svec!["2023-01-01;100;150"]]; assert_eq!(got, expected); } ); @@ -549,7 +549,7 @@ pivotp_test!( wrk.assert_success(&mut cmd); let got: Vec> = wrk.read_stdout(&mut cmd); - let expected = vec![svec!["date;A;B"], svec!["2023-01-01;1;1"]]; + let expected = vec![svec!["date;A;B"], svec!["2023-01-01;100.5;150.75"]]; assert_eq!(got, expected); } ); @@ -577,8 +577,8 @@ pivotp_test!( let got: Vec> = wrk.read_stdout(&mut cmd); let expected = vec![ svec!["date", "A", "B"], - svec!["2023-01-01", "2", "1"], - svec!["2023-01-02", "1", "2"], + svec!["2023-01-01", "300", "150"], + svec!["2023-01-02", "300", "600"], ]; assert_eq!(got, expected); } @@ -604,8 +604,8 @@ pivotp_test!( let got: Vec> = wrk.read_stdout(&mut cmd); let expected = vec![ svec!["date", "A", "B"], - svec!["2023-01-01", "2", "1"], - svec!["2023-01-02", "1", "2"], + svec!["2023-01-01", "300", "150"], + svec!["2023-01-02", "300", "600"], ]; assert_eq!(got, expected); } From 3bcdac040530cd884e08052bebafee392bed7dd8 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 22 Dec 2024 12:32:51 -0500 Subject: [PATCH 3/3] docs: mention `pivotp`'s ability to auto-pick an aggregation based on column data type & stats --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0aa773ea0..3fbfede4b 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ | [lens](/src/cmd/lens.rs#L2)✨ | Interactively view, search & filter a CSV using the [csvlens](https://github.com/YS-L/csvlens#csvlens) engine. |
[luau](/src/cmd/luau.rs#L2) 👑✨
📇🌐🔣📚 ![CKAN](docs/images/ckan.png) | Create multiple new computed columns, filter rows, compute aggregations and build complex data pipelines by executing a [Luau](https://luau-lang.org) [0.653](https://github.com/Roblox/luau/releases/tag/0.653) expression/script for every row of a CSV file ([sequential mode](https://github.com/dathere/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L254-L298)), or using [random access](https://www.webopedia.com/definitions/random-access/) with an index ([random access mode](https://github.com/dathere/qsv/blob/bb72c4ef369d192d85d8b7cc6e972c1b7df77635/tests/test_luau.rs#L367-L415)).
Can process a single Luau expression or [full-fledged data-wrangling scripts using lookup tables](https://github.com/dathere/qsv-lookup-tables#example) with discrete BEGIN, MAIN and END sections.
It is not just another qsv command, it is qsv's [Domain-specific Language](https://en.wikipedia.org/wiki/Domain-specific_language) (DSL) with [numerous qsv-specific helper functions](https://github.com/dathere/qsv/blob/113eee17b97882dc368b2e65fec52b86df09f78b/src/cmd/luau.rs#L1356-L2290) to build production data pipelines. | | [partition](/src/cmd/partition.rs#L2)
👆 | Partition a CSV based on a column value. | -| [pivotp](/src/cmd/pivotp.rs#L2)✨
🚀🐻‍❄️🪄 | Pivot CSV data. | +| [pivotp](/src/cmd/pivotp.rs#L2)✨
🚀🐻‍❄️🪄 | Pivot CSV data. Features "smart" aggregation auto-selection based on data type & stats. | | [pro](/src/cmd/pro.rs#L2) | Interact with the [qsv pro](https://qsvpro.dathere.com) API. | | [prompt](/src/cmd/prompt.rs#L2)✨ | Open a file dialog to either pick a file as input or save output to a file. | | [pseudo](/src/cmd/pseudo.rs#L2)
🔣👆 | [Pseudonymise](https://en.wikipedia.org/wiki/Pseudonymization) the value of the given column by replacing them with an incremental identifier. |