Skip to content

Commit

Permalink
feat: improve string statistics display (#8535)
Browse files Browse the repository at this point in the history
  • Loading branch information
asimsedhain authored Dec 14, 2023
1 parent 974d49c commit 1042095
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 26 deletions.
77 changes: 51 additions & 26 deletions datafusion-cli/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use datafusion::logical_expr::Expr;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::scalar::ScalarValue;
use parquet::basic::ConvertedType;
use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;
use parquet::file::statistics::Statistics;
Expand Down Expand Up @@ -246,6 +247,52 @@ impl TableProvider for ParquetMetadataTable {
}
}

/// Formats the minimum and maximum of a parquet statistics value as strings.
///
/// Byte-array statistics (`ByteArray` and `FixedLenByteArray`) belonging to a
/// column whose converted type is `ConvertedType::UTF8` are rendered as text
/// when `as_utf8()` succeeds. If decoding fails, or the converted type is
/// anything else, the value's own `to_string()` output is used instead. Every
/// other physical type is always rendered with `to_string()`.
///
/// Returns the pair `(min, max)`.
///
/// NOTE(review): this reads `min()` / `max()` unconditionally; presumably the
/// caller must check that min/max are set first — confirm.
fn convert_parquet_statistics(
    value: &Statistics,
    converted_type: ConvertedType,
) -> (String, String) {
    // `to_string()` rendering of both bounds. A macro rather than a helper
    // function because every `Statistics` variant wraps a different type.
    macro_rules! displayed {
        ($stats:expr) => {
            ($stats.min().to_string(), $stats.max().to_string())
        };
    }

    // Text rendering of one bound, falling back to `to_string()` when the
    // bytes cannot be decoded as UTF-8.
    macro_rules! utf8_or_displayed {
        ($bytes:expr) => {{
            let raw = $bytes;
            match raw.as_utf8() {
                Ok(text) => text.to_string(),
                Err(_) => raw.to_string(),
            }
        }};
    }

    let is_utf8 = matches!(converted_type, ConvertedType::UTF8);

    match value {
        Statistics::Boolean(stats) => displayed!(stats),
        Statistics::Int32(stats) => displayed!(stats),
        Statistics::Int64(stats) => displayed!(stats),
        Statistics::Int96(stats) => displayed!(stats),
        Statistics::Float(stats) => displayed!(stats),
        Statistics::Double(stats) => displayed!(stats),
        Statistics::ByteArray(stats) if is_utf8 => {
            let (lower, upper) = (stats.min(), stats.max());
            (utf8_or_displayed!(lower), utf8_or_displayed!(upper))
        }
        Statistics::ByteArray(stats) => displayed!(stats),
        Statistics::FixedLenByteArray(stats) if is_utf8 => {
            let (lower, upper) = (stats.min(), stats.max());
            (utf8_or_displayed!(lower), utf8_or_displayed!(upper))
        }
        Statistics::FixedLenByteArray(stats) => displayed!(stats),
    }
}

pub struct ParquetMetadataFunc {}

impl TableFunctionImpl for ParquetMetadataFunc {
Expand Down Expand Up @@ -326,34 +373,12 @@ impl TableFunctionImpl for ParquetMetadataFunc {
num_values_arr.push(column.num_values());
path_in_schema_arr.push(column.column_path().to_string());
type_arr.push(column.column_type().to_string());
let converted_type = column.column_descr().converted_type();

if let Some(s) = column.statistics() {
let (min_val, max_val) = if s.has_min_max_set() {
let (min_val, max_val) = match s {
Statistics::Boolean(val) => {
(val.min().to_string(), val.max().to_string())
}
Statistics::Int32(val) => {
(val.min().to_string(), val.max().to_string())
}
Statistics::Int64(val) => {
(val.min().to_string(), val.max().to_string())
}
Statistics::Int96(val) => {
(val.min().to_string(), val.max().to_string())
}
Statistics::Float(val) => {
(val.min().to_string(), val.max().to_string())
}
Statistics::Double(val) => {
(val.min().to_string(), val.max().to_string())
}
Statistics::ByteArray(val) => {
(val.min().to_string(), val.max().to_string())
}
Statistics::FixedLenByteArray(val) => {
(val.min().to_string(), val.max().to_string())
}
};
let (min_val, max_val) =
convert_parquet_statistics(s, converted_type);
(Some(min_val), Some(max_val))
} else {
(None, None)
Expand Down
24 changes: 24 additions & 0 deletions datafusion-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -420,4 +420,28 @@ mod tests {

Ok(())
}

/// Checks that `parquet_metadata` reports string column statistics as
/// readable text (`Hello` / `today`) for a parquet file with a string column.
#[tokio::test]
async fn test_parquet_metadata_works_with_strings() -> Result<(), DataFusionError> {
    let ctx = SessionContext::new();
    ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {}));

    // Query the metadata of an input file that has a string column.
    let query =
        "SELECT * FROM parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')";
    let batches = ctx.sql(query).await?.collect().await?;

    // The same border line opens the table, separates the header, and closes the table.
    let border =
        "+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+";
    // NOTE(review): the cell padding in the header and data rows looks narrower
    // than the border widths — confirm these literals against the real output.
    let header =
        "| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |";
    let row =
        "| ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | \"String\" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 |";

    let expected = [border, header, border, row, border];
    assert_batches_eq!(expected, &batches);

    Ok(())
}
}

0 comments on commit 1042095

Please sign in to comment.