Skip to content

Commit

Permalink
feat: add column statistics into explain (#8112)
Browse files Browse the repository at this point in the history
* feat: add column statistics into explain

* feat: only show non-absent statistics

* fix: update test output
  • Loading branch information
NGA-TRAN authored Nov 12, 2023
1 parent 5a2e0ba commit 9e012a6
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 6 deletions.
39 changes: 38 additions & 1 deletion datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,44 @@ impl Statistics {

impl Display for Statistics {
/// Renders the statistics as a single line of the form
/// `Rows=<num_rows>, Bytes=<total_byte_size>, [(Col[0]: ...),(Col[1]: ...)]`.
///
/// Each column entry lists only the statistics that are not
/// `Precision::Absent`, in the order `Min`, `Max`, `Null`, `Distinct`.
/// A column with no known statistics is rendered as `(Col[i]:)`.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    // The row/byte header is written exactly once, followed by the opening
    // bracket of the per-column list.
    write!(
        f,
        "Rows={}, Bytes={}, [",
        self.num_rows, self.total_byte_size
    )?;

    // Write each column straight into the formatter rather than building
    // and re-formatting intermediate `String`s for every statistic.
    for (i, cs) in self.column_statistics.iter().enumerate() {
        // Columns are separated by a bare comma (no space).
        if i > 0 {
            f.write_str(",")?;
        }

        write!(f, "(Col[{}]:", i)?;
        if cs.min_value != Precision::Absent {
            write!(f, " Min={}", cs.min_value)?;
        }
        if cs.max_value != Precision::Absent {
            write!(f, " Max={}", cs.max_value)?;
        }
        if cs.null_count != Precision::Absent {
            write!(f, " Null={}", cs.null_count)?;
        }
        if cs.distinct_count != Precision::Absent {
            write!(f, " Distinct={}", cs.distinct_count)?;
        }
        f.write_str(")")?;
    }

    f.write_str("]")
}
Expand Down
5 changes: 4 additions & 1 deletion datafusion/core/tests/sql/explain_analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -827,5 +827,8 @@ async fn csv_explain_analyze_with_statistics() {
.to_string();

// should contain scan statistics
assert_contains!(&formatted, ", statistics=[Rows=Absent, Bytes=Absent]");
assert_contains!(
&formatted,
", statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:)]]"
);
}
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/explain.slt
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,8 @@ query TT
EXPLAIN SELECT a, b, c FROM simple_explain_test limit 10;
----
physical_plan
GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Inexact(10), Bytes=Absent]
--CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], limit=10, has_header=true, statistics=[Rows=Absent, Bytes=Absent]
GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Inexact(10), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]
--CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], limit=10, has_header=true, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]]

# Parquet scan with statistics collected
statement ok
Expand All @@ -288,8 +288,8 @@ query TT
EXPLAIN SELECT * FROM alltypes_plain limit 10;
----
physical_plan
GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent]
--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent]
GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]

statement ok
set datafusion.execution.collect_statistics = false;
Expand Down

0 comments on commit 9e012a6

Please sign in to comment.