From 5fdde54ec62e1c68adc14788fb209173376883b1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 13 Apr 2023 19:40:03 +0100 Subject: [PATCH 1/5] Don't use parquet file offset for file range pruning --- .../src/physical_plan/file_format/parquet/row_groups.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs index 376ae35c66d9..37ae4e189dc5 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs @@ -53,7 +53,11 @@ pub(crate) fn prune_row_groups( let mut filtered = Vec::with_capacity(groups.len()); for (idx, metadata) in groups.iter().enumerate() { if let Some(range) = &range { - let offset = metadata.column(0).file_offset(); + let col = metadata.column(0); + let offset = col + .dictionary_page_offset() + .unwrap_or(0) + .max(col.data_page_offset()); if offset < range.start || offset >= range.end { continue; } From f5df27de35cc1c2f5d3b6d6f44ddca0b54c7c052 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 13 Apr 2023 20:27:10 +0100 Subject: [PATCH 2/5] Update datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs Co-authored-by: Andrew Lamb --- .../core/src/physical_plan/file_format/parquet/row_groups.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs index 37ae4e189dc5..1b3d8210f6cf 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs @@ -53,6 +53,9 @@ pub(crate) fn prune_row_groups( let mut filtered = Vec::with_capacity(groups.len()); for (idx, metadata) in groups.iter().enumerate() { if let 
Some(range) = &range { + // figure out where the first dictionary page (or first data page) is + // note: don't use the location of the metadata + // let col = metadata.column(0); let offset = col .dictionary_page_offset() From b6ced94d69772fa4b20f67220b6313cacf6a4cb9 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 13 Apr 2023 20:56:12 +0100 Subject: [PATCH 3/5] Format --- .../src/physical_plan/file_format/parquet/row_groups.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs index 1b3d8210f6cf..67533ff9cab0 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs @@ -53,9 +53,9 @@ pub(crate) fn prune_row_groups( let mut filtered = Vec::with_capacity(groups.len()); for (idx, metadata) in groups.iter().enumerate() { if let Some(range) = &range { - // figure out where the first dictionary page (or first data page) is - // note: don't use the location of the metadata - // + // figure out where the first dictionary page (or first data page) is + // note: don't use the location of the metadata + // let col = metadata.column(0); let offset = col .dictionary_page_offset() From 18b81eadf2d3449dcc158eba974567334ba02e70 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 13 Apr 2023 20:56:57 +0100 Subject: [PATCH 4/5] Tweak logic --- .../core/src/physical_plan/file_format/parquet/row_groups.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs index 67533ff9cab0..86cf06620c20 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs @@ -59,8 +59,7 @@ 
pub(crate) fn prune_row_groups( let col = metadata.column(0); let offset = col .dictionary_page_offset() - .unwrap_or(0) - .max(col.data_page_offset()); + .unwrap_or_else(|| col.data_page_offset()); if offset < range.start || offset >= range.end { continue; } From 13082c920188284cf06c9c46d86a811ec7c65315 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 13 Apr 2023 21:18:08 +0100 Subject: [PATCH 5/5] Update test --- datafusion/core/src/physical_plan/file_format/parquet.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index 92be32f47649..8c5b5e4de8ea 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -1622,11 +1622,11 @@ mod tests { .infer_schema(&state, &store, &[meta.clone()]) .await?; - let group_empty = vec![vec![file_range(&meta, 0, 5)]]; - let group_contain = vec![vec![file_range(&meta, 5, i64::MAX)]]; + let group_empty = vec![vec![file_range(&meta, 0, 2)]]; + let group_contain = vec![vec![file_range(&meta, 2, i64::MAX)]]; let group_all = vec![vec![ - file_range(&meta, 0, 5), - file_range(&meta, 5, i64::MAX), + file_range(&meta, 0, 2), + file_range(&meta, 2, i64::MAX), ]]; assert_parquet_read(&state, group_empty, None, file_schema.clone()).await?;