fix: Limit together with pushdown_filters #13788

Merged: 4 commits, merged on Dec 16, 2024
Changes from 3 commits
7 changes: 6 additions & 1 deletion datafusion/core/src/datasource/listing/table.rs
@@ -843,8 +843,13 @@ impl TableProvider for ListingTable {
         });
         // TODO (https://github.com/apache/datafusion/issues/11600) remove downcast_ref from here?
         let session_state = state.as_any().downcast_ref::<SessionState>().unwrap();
+
+        // We should not limit the number of partitioned files to scan if there are filters and a limit
+        // at the same time, because the limit should be applied after the filters are applied.
+        let statistic_file_limit = if filters.is_empty() { limit } else { None };
+
         let (mut partitioned_file_lists, statistics) = self
-            .list_files_for_scan(session_state, &partition_filters, limit)
+            .list_files_for_scan(session_state, &partition_filters, statistic_file_limit)
             .await?;
 
         // if no files need to be read, return an `EmptyExec`
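For context, `list_files_for_scan` uses the passed limit to stop collecting files once their row-count statistics already cover the requested number of rows. Below is a minimal sketch of why that pruning is unsound when a filter runs afterwards; the `FileStats` type and `prune_files_by_limit` function are hypothetical illustrations of the idea, not DataFusion's internals:

// Hypothetical sketch: pruning files by row-count statistics is only
// sound when no filter runs afterwards, because the kept files may
// contain fewer matching rows than their statistics suggest.

struct FileStats {
    path: &'static str,
    num_rows: usize,
}

/// Keep listing files until their statistics already cover `limit` rows.
/// This is what a statistics-based file limit effectively does.
fn prune_files_by_limit(files: &[FileStats], limit: Option<usize>) -> Vec<&FileStats> {
    let mut kept = Vec::new();
    let mut rows_so_far = 0;
    for file in files {
        kept.push(file);
        rows_so_far += file.num_rows;
        if limit.is_some_and(|l| rows_so_far >= l) {
            break; // "enough" rows *before* filtering -- unsound with filters
        }
    }
    kept
}

fn main() {
    // Mirrors the test below: file 0 has 3 rows (none match `value = 2`),
    // file 1 has 4 rows (the matching rows live here).
    let files = [
        FileStats { path: "0.parquet", num_rows: 3 },
        FileStats { path: "1.parquet", num_rows: 4 },
    ];

    // Buggy behavior: `limit 1` prunes the listing down to file 0 only,
    // so `where value = 2 limit 1` would return zero rows.
    let kept = prune_files_by_limit(&files, Some(1));
    assert_eq!(kept.iter().map(|f| f.path).collect::<Vec<_>>(), ["0.parquet"]);

    // The fix: disable the statistics-based file limit when filters exist.
    let filters_present = true;
    let statistic_file_limit = if filters_present { None } else { Some(1) };
    let kept = prune_files_by_limit(&files, statistic_file_limit);
    assert_eq!(kept.len(), 2); // both files are scanned; limit applies after the filter
}

Passing `None` as the file-listing limit whenever `filters` is non-empty keeps every candidate file in the scan; the `limit` is then enforced later in the plan, after the pushed-down filter has run.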
58 changes: 58 additions & 0 deletions datafusion/sqllogictest/test_files/push_down_filter.slt
@@ -122,3 +122,61 @@ logical_plan

 statement ok
 drop table d;
+
+
+# Test push down filter with limit for parquet
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+# this is also required to make DataFusion skip the second file due to a "sufficient" number of rows
+statement ok
+set datafusion.execution.collect_statistics = true;
+
+# Create a table as a data source
+statement ok
+CREATE TABLE src_table (
+part_key INT,
+value INT
+) AS VALUES(1, 0), (1, 1), (1, 100), (2, 0), (2, 2), (2, 2), (2, 100);
+
+
+# Two records match the filter, to check that `limit 1` is actually applied.
+# Set up 2 files, i.e., as many as there are partitions:
+
+# File 1:
+query I
+COPY (SELECT * FROM src_table where part_key = 1)
+TO 'test_files/scratch/parquet/test_filter_with_limit/0.parquet'
+STORED AS PARQUET;
+----
+3
+
+# File 2:
+query I
+COPY (SELECT * FROM src_table where part_key = 2)
+TO 'test_files/scratch/parquet/test_filter_with_limit/1.parquet'
+STORED AS PARQUET;
+----
+4
+
+statement ok
+CREATE EXTERNAL TABLE test_filter_with_limit
+(
+part_key INT,
+value INT
+)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet/test_filter_with_limit/';
+
+query II
+select * from test_filter_with_limit where value = 2 limit 1;
+----
+2 2
+
+# Tear down test_filter_with_limit table:
+statement ok
+DROP TABLE test_filter_with_limit;
+
+# Tear down src_table table:
+statement ok
+DROP TABLE src_table;
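The same scenario can be reproduced outside sqllogictest. Here is a sketch against the DataFusion Rust API, assuming a version where `SessionConfig::set_bool`, `SessionContext::new_with_config`, and `SessionContext::register_parquet` are available, and that the two parquet files written by the test above already exist at the given path:

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // Same settings as the slt directives above.
    let config = SessionConfig::new()
        .set_bool("datafusion.execution.parquet.pushdown_filters", true)
        .set_bool("datafusion.execution.collect_statistics", true);
    let ctx = SessionContext::new_with_config(config);

    // Point the table at the directory containing 0.parquet and 1.parquet.
    ctx.register_parquet(
        "test_filter_with_limit",
        "test_files/scratch/parquet/test_filter_with_limit/",
        ParquetReadOptions::default(),
    )
    .await?;

    // With the fix, this prints the row `2 2` even though the first file's
    // statistics already cover the limit -- the filter runs before the limit.
    ctx.sql("select * from test_filter_with_limit where value = 2 limit 1")
        .await?
        .show()
        .await?;

    Ok(())
}

This is exactly the regression the slt test guards against: with pushdown filters and statistics collection both enabled, the statistics-based file limit could previously prune the listing down to `0.parquet`, where no row satisfies `value = 2`.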