From 5c025cc8062558fee586a88d49e1d6de433a86be Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 13 Apr 2023 22:13:33 +0100 Subject: [PATCH] Don't use parquet file offset for file range pruning (#5997) * Don't use parquet file offset for file range pruning * Update datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs Co-authored-by: Andrew Lamb * Format * Tweak logic * Update test --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/physical_plan/file_format/parquet.rs | 8 ++++---- .../src/physical_plan/file_format/parquet/row_groups.rs | 8 +++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index 349fa68a4b99..c69cdb7417f5 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -1619,11 +1619,11 @@ mod tests { .infer_schema(&state, &store, &[meta.clone()]) .await?; - let group_empty = vec![vec![file_range(&meta, 0, 5)]]; - let group_contain = vec![vec![file_range(&meta, 5, i64::MAX)]]; + let group_empty = vec![vec![file_range(&meta, 0, 2)]]; + let group_contain = vec![vec![file_range(&meta, 2, i64::MAX)]]; let group_all = vec![vec![ - file_range(&meta, 0, 5), - file_range(&meta, 5, i64::MAX), + file_range(&meta, 0, 2), + file_range(&meta, 2, i64::MAX), ]]; assert_parquet_read(&state, group_empty, None, file_schema.clone()).await?; diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs index 376ae35c66d9..86cf06620c20 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs @@ -53,7 +53,13 @@ pub(crate) fn prune_row_groups( let mut filtered = Vec::with_capacity(groups.len()); for 
(idx, metadata) in groups.iter().enumerate() { if let Some(range) = &range { - let offset = metadata.column(0).file_offset(); + // figure out where the first dictionary page (or first data page) is + // note: don't use the location of the metadata + // + let col = metadata.column(0); + let offset = col + .dictionary_page_offset() + .unwrap_or_else(|| col.data_page_offset()); if offset < range.start || offset >= range.end { continue; }