Skip to content

Commit

Permalink
Don't use parquet file offset for file range pruning (#5997)
Browse files Browse the repository at this point in the history
* Don't use parquet file offset for file range pruning

* Update datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs

Co-authored-by: Andrew Lamb <[email protected]>

* Format

* Tweak logic

* Update test

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
tustvold and alamb authored Apr 13, 2023
1 parent 59fd93c commit 5c025cc
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
8 changes: 4 additions & 4 deletions datafusion/core/src/physical_plan/file_format/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1619,11 +1619,11 @@ mod tests {
.infer_schema(&state, &store, &[meta.clone()])
.await?;

let group_empty = vec![vec![file_range(&meta, 0, 5)]];
let group_contain = vec![vec![file_range(&meta, 5, i64::MAX)]];
let group_empty = vec![vec![file_range(&meta, 0, 2)]];
let group_contain = vec![vec![file_range(&meta, 2, i64::MAX)]];
let group_all = vec![vec![
file_range(&meta, 0, 5),
file_range(&meta, 5, i64::MAX),
file_range(&meta, 0, 2),
file_range(&meta, 2, i64::MAX),
]];

assert_parquet_read(&state, group_empty, None, file_schema.clone()).await?;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,13 @@ pub(crate) fn prune_row_groups(
let mut filtered = Vec::with_capacity(groups.len());
for (idx, metadata) in groups.iter().enumerate() {
if let Some(range) = &range {
let offset = metadata.column(0).file_offset();
// figure out where the first dictionary page (or, if there is none, the first data page) is
// note: don't use the location of the metadata
// <https://github.com/apache/arrow-datafusion/issues/5995>
let col = metadata.column(0);
let offset = col
.dictionary_page_offset()
.unwrap_or_else(|| col.data_page_offset());
if offset < range.start || offset >= range.end {
continue;
}
Expand Down

0 comments on commit 5c025cc

Please sign in to comment.