Skip to content

Commit

Permalink
fix(rust): inconsistent order of partitioning columns (#2494) (#2614)
Browse files Browse the repository at this point in the history
# Description

We need a stable order between the logical and physical schemas, but the order
of partitioning columns is not always the same in the JSON schema and
in the `partition_columns` array. This is visible only when you have more than
one partitioning column.

This can surface in different ways:
- select a partitioning column and get the values from another partitioning column
- filter by a partitioning column and get the wrong results (#2494)

# Related Issue(s)
- closes #2494

---------

Co-authored-by: R. Tyler Croy <[email protected]>
  • Loading branch information
aditanase and rtyler authored Jul 11, 2024
1 parent 197a474 commit f432c4f
Showing 1 changed file with 24 additions and 23 deletions.
47 changes: 24 additions & 23 deletions crates/core/src/delta_datafusion/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,34 +191,35 @@ impl DataFusionMixins for DeltaTableState {

fn _arrow_schema(snapshot: &Snapshot, wrap_partitions: bool) -> DeltaResult<ArrowSchemaRef> {
let meta = snapshot.metadata();
let fields = meta
.schema()?

let schema = meta.schema()?;
let fields = schema
.fields()
.filter(|f| !meta.partition_columns.contains(&f.name().to_string()))
.map(|f| f.try_into())
.chain(
meta.schema()?
.fields()
.filter(|f| meta.partition_columns.contains(&f.name().to_string()))
.map(|f| {
let field = Field::try_from(f)?;
let corrected = if wrap_partitions {
match field.data_type() {
// Only dictionary-encode types that may be large
// // https://github.com/apache/arrow-datafusion/pull/5545
DataType::Utf8
| DataType::LargeUtf8
| DataType::Binary
| DataType::LargeBinary => {
wrap_partition_type_in_dict(field.data_type().clone())
}
_ => field.data_type().clone(),
// We need stable order between logical and physical schemas, but the order of
// partitioning columns is not always the same in the json schema and the array
meta.partition_columns.iter().map(|partition_col| {
let f = schema.field(partition_col).unwrap();
let field = Field::try_from(f)?;
let corrected = if wrap_partitions {
match field.data_type() {
// Only dictionary-encode types that may be large
// // https://github.com/apache/arrow-datafusion/pull/5545
DataType::Utf8
| DataType::LargeUtf8
| DataType::Binary
| DataType::LargeBinary => {
wrap_partition_type_in_dict(field.data_type().clone())
}
} else {
field.data_type().clone()
};
Ok(field.with_data_type(corrected))
}),
_ => field.data_type().clone(),
}
} else {
field.data_type().clone()
};
Ok(field.with_data_type(corrected))
}),
)
.collect::<Result<Vec<Field>, _>>()?;

Expand Down

0 comments on commit f432c4f

Please sign in to comment.