From 62772f102d4cc7ea3477e90d6d3d4adbfc2bace1 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 26 Jun 2024 17:48:02 -0700 Subject: [PATCH 01/54] test for json --- kernel/src/engine/default/json.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index b51970e6c..048b3d2ec 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -220,6 +220,7 @@ impl FileOpener for JsonOpener { mod tests { use std::path::PathBuf; + use arrow::array::AsArray; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use itertools::Itertools; use object_store::{local::LocalFileSystem, ObjectStore}; @@ -256,6 +257,28 @@ mod tests { assert_eq!(batch.length(), 4); } + #[test] + fn test_parse_json_drop_field() { + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + let json_strings = StringArray::from(vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2, "maxRowId": 3}}}"# + ]); + let output_schema = Arc::new(get_log_schema().clone()); + + let batch: RecordBatch = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap() + .into_any() + .downcast::() + .map(|sd| sd.into()).unwrap(); + assert_eq!(batch.column(0).len(), 1); + let add_array = batch.column_by_name("add").unwrap().as_struct(); + let dv_col = 
add_array.column_by_name("deletionVector").unwrap().as_struct(); + assert!(dv_col.column_by_name("storageType").is_some()); + assert!(dv_col.column_by_name("maxRowId").is_none()); + } + #[tokio::test] async fn test_read_json_files() { let store = Arc::new(LocalFileSystem::new()); From 3e9665c9fc388ecc8479f191d00192526b8beb3b Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 28 Jun 2024 14:01:54 -0700 Subject: [PATCH 02/54] checkpoint before ReorderIndex --- kernel/src/engine/arrow_utils.rs | 445 ++++++++++++++++++++++++++++-- kernel/src/engine/default/json.rs | 10 +- 2 files changed, 423 insertions(+), 32 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 1e15199e4..c3d951538 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -2,62 +2,175 @@ use std::sync::Arc; -use crate::{schema::SchemaRef, utils::require, DeltaResult, Error}; +use crate::{ + schema::{DataType, Schema, SchemaRef, StructField, StructType}, + utils::require, + DeltaResult, Error, +}; use arrow_array::RecordBatch; -use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; +use arrow_schema::{ + DataType as ArrowDataType, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, +}; use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; -/// Get the indicies in `parquet_schema` of the specified columns in `requested_schema`. This -/// returns a tuples of (mask_indicies: Vec, reorder_indicies: -/// Vec). `mask_indicies` is used for generating the mask for reading from the +/// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. Each element's index +/// represents a column that will be in the read parquet data at that level and index. If the value +/// stored is an `Index` variant, the associated `usize` is the position that the column should +/// appear in the final output. 
If it is a `Child` variant, then at that index there is a `Struct` +/// whose ordering is specified by the values in the associated `Vec` according to these same rules. +enum ReorderIndex { + Index(usize), + Child(Vec), +} + +/// helper function, does the same as `get_requested_indices` but at an offset. used to recurse into +/// structs. this is called recursively to traverse into structs and lists. `parquet_offset` is how +/// many parquet fields exist before processing this potentially nested schema. `reorder_offset` is +/// how many fields we've found so far before processing at this nested schema. returns the number +/// of parquet fields and the number of requested fields processed +fn get_indices( + start_parquet_offset: usize, + start_reorder_offset: usize, + requested_schema: &Schema, + fields: &Fields, + mask_indices: &mut Vec, + reorder_indices: &mut Vec, +) -> DeltaResult<(usize, usize)> { + let mut found_fields = 0; + let mut parquet_offset = start_parquet_offset; + let mut reorder_offset = start_reorder_offset; + println!("at top with parquet_offset {parquet_offset} and reorder_offset {reorder_offset}"); + for (parquet_index, field) in fields.iter().enumerate() { + println!("looking at field {}", field.name()); + match field.data_type() { + ArrowDataType::Struct(fields) => { + if let Some(requested_field) = requested_schema.fields.get(field.name()) { + match requested_field.data_type { + DataType::Struct(ref requested_schema) => { + let (parquet_advance, reorder_advance) = get_indices( + found_fields + parquet_offset, + found_fields + reorder_offset, + requested_schema.as_ref(), + fields, + mask_indices, + reorder_indices, + )?; + // advance the number of parquet fields, but subtract 1 because the + // struct will be counted by the `enumerate` call but doesn't count as + // an actual index. 
+ println!("here:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}\n reorder_offset: {reorder_offset}\n reorder_advance: {reorder_advance}"); + parquet_offset += parquet_advance - 1; + // advance the reorder offset + reorder_offset += reorder_advance; + // also increase found_fields because the struct is a field we've found + // and will count in the `requested_schema.fields.len()` call below + found_fields += 1; + } + _ => { + return Err(Error::unexpected_column_type(field.name())); + } + } + } + } + ArrowDataType::List(list_field) | ArrowDataType::ListView(list_field) => { + if let Some(requested_field) = requested_schema.fields.get(field.name()) { + // we just want to transparently recurse into lists, need to transform the kernel + // list data type into a schema + match requested_field.data_type() { + DataType::Array(array_type) => { + let requested_schema = StructType::new(vec![StructField::new( + list_field.name().clone(), // so we find it in the inner call + array_type.element_type.clone(), + array_type.contains_null, + )]); + let (parquet_advance, reorder_advance) = get_indices( + found_fields + parquet_offset, + found_fields + reorder_offset, + &requested_schema, + &[list_field.clone()].into(), + mask_indices, + reorder_indices, + )?; + // see comment above in struct match arm + println!("here list:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}\n reorder_offset: {reorder_offset}\n reorder_advance: {reorder_advance}"); + parquet_offset += parquet_advance - 1; + reorder_offset += reorder_advance - 1; // inner array field doesn't count + found_fields += 1; + } + _ => { + return Err(Error::unexpected_column_type(list_field.name())); + } + } + } + } + _ => { + if let Some(index) = requested_schema.index_of(field.name()) { + found_fields += 1; + mask_indices.push(parquet_offset + parquet_index); + reorder_indices.push(reorder_offset + index); // this should add the number of FOUND fields in total so far, not 
offset_index + } + } + } + } + println!("found {found_fields}"); + //println!("found {found_fields}, requested {}. req schema: {:?}", requested_schema.fields.len(), requested_schema); + require!( + found_fields == requested_schema.fields.len(), + Error::generic("Didn't find all requested columns in parquet schema") + ); + Ok(( + parquet_offset + fields.len() - start_parquet_offset, + reorder_offset + requested_schema.fields.len() - start_reorder_offset, + )) +} + +/// Get the indices in `parquet_schema` of the specified columns in `requested_schema`. This +/// returns a tuples of (mask_indices: Vec, reorder_indices: +/// Vec). `mask_indices` is used for generating the mask for reading from the /// parquet file, and simply contains an entry for each index we wish to select from the parquet -/// file set to the index of the requested column in the parquet. `reorder_indicies` is used for -/// re-ordering and will be the same size as `requested_schema`. Each index in `reorder_indicies` +/// file set to the index of the requested column in the parquet. `reorder_indices` is used for +/// re-ordering and will be the same size as `requested_schema`. Each index in `reorder_indices` /// represents a column that will be in the read parquet data at that index. The value stored in -/// `reorder_indicies` is the position that the column should appear in the final output. For -/// example, if `reorder_indicies` is `[2,0,1]`, then the re-ordering code should take the third +/// `reorder_indices` is the position that the column should appear in the final output. For +/// example, if `reorder_indices` is `[2,0,1]`, then the re-ordering code should take the third /// column in the raw-read parquet data, and move it to the first column in the final output, the /// first column to the second, and the second to the third. 
pub(crate) fn get_requested_indices( requested_schema: &SchemaRef, parquet_schema: &ArrowSchemaRef, ) -> DeltaResult<(Vec, Vec)> { - let (mask_indicies, reorder_indicies): (Vec, Vec) = parquet_schema - .fields() - .iter() - .enumerate() - .filter_map(|(parquet_index, field)| { - requested_schema - .index_of(field.name()) - .map(|index| (parquet_index, index)) - }) - .unzip(); - require!( - mask_indicies.len() == requested_schema.fields.len(), - Error::generic("Didn't find all requested columns in parquet schema") - ); - Ok((mask_indicies, reorder_indicies)) + let mut mask_indices = vec![]; + let mut reorder_indices = vec![]; + get_indices( + 0, + 0, + requested_schema, + parquet_schema.fields(), + &mut mask_indices, + &mut reorder_indices, + )?; + Ok((mask_indices, reorder_indices)) } -/// Create a mask that will only select the specified indicies from the parquet. Currently we only +/// Create a mask that will only select the specified indices from the parquet. Currently we only /// handle "root" level columns, and hence use `ProjectionMask::roots`, but will support leaf /// selection in the future. See issues #86 and #96 as well. 
pub(crate) fn generate_mask( requested_schema: &SchemaRef, parquet_schema: &ArrowSchemaRef, parquet_physical_schema: &SchemaDescriptor, - indicies: &[usize], + indices: &[usize], ) -> Option { if parquet_schema.fields.size() == requested_schema.fields.len() { - // we assume that in get_requested_indicies we will have caught any column name mismatches, + // we assume that in get_requested_indices we will have caught any column name mismatches, // so here we can just say that if we request the same # of columns as the parquet file // actually has, we don't need to mask anything out None } else { Some(ProjectionMask::roots( parquet_physical_schema, - indicies.to_owned(), + indices.to_owned(), )) } } @@ -69,7 +182,7 @@ pub(crate) fn reorder_record_batch( requested_ordering: &[usize], ) -> DeltaResult { if requested_ordering.windows(2).all(|is| is[0] < is[1]) { - // indicies is already sorted, meaning we requested in the order that the columns were + // indices is already sorted, meaning we requested in the order that the columns were // stored in the parquet Ok(input_data) } else { @@ -87,3 +200,277 @@ pub(crate) fn reorder_record_batch( Ok(RecordBatch::try_new(schema, reordered_columns)?) 
} } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema}; + + use crate::schema::{ArrayType, DataType, StructField, StructType}; + + use super::get_requested_indices; + + #[test] + fn simple_mask_indices() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new("s", DataType::STRING, true), + StructField::new("i2", DataType::INTEGER, true), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("s", ArrowDataType::Utf8, true), + ArrowField::new("i2", ArrowDataType::Int32, true), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect = vec![0, 1, 2]; + assert_eq!(mask_indices, expect); + assert_eq!(reorder_indices, expect); + } + + #[test] + fn simple_reorder_indices() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new("s", DataType::STRING, true), + StructField::new("i2", DataType::INTEGER, true), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i2", ArrowDataType::Int32, true), + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("s", ArrowDataType::Utf8, true), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 2]; + let expect_reorder = vec![2, 0, 1]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn nested_indices() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new( + "nested", + StructType::new(vec![ + StructField::new("int32", DataType::INTEGER, false), + StructField::new("string", DataType::STRING, false), + ]), + 
false, + ), + StructField::new("j", DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 2, 3]; + let expect_reorder = vec![0, 1, 2, 3]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn nested_indices_reorder() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("j", DataType::INTEGER, false), + StructField::new( + "nested", + StructType::new(vec![ + StructField::new("string", DataType::STRING, false), + StructField::new("int32", DataType::INTEGER, false), + ]), + false, + ), + StructField::new("i", DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 2, 3]; + let expect_reorder = vec![0, 1, 2, 3]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn nested_indices_mask_inner() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new( + "nested", + 
StructType::new(vec![StructField::new("int32", DataType::INTEGER, false)]), + false, + ), + StructField::new("j", DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 3]; + let expect_reorder = vec![0, 1, 3]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn simple_list_mask() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new("list", ArrayType::new(DataType::INTEGER, false), false), + StructField::new("j", DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "nested", + ArrowDataType::Int32, + false, + ))), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 2]; + let expect_reorder = vec![0, 1, 2]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn nested_indices_list_mask_inner() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new( + "list", + ArrayType::new( + StructType::new(vec![StructField::new("int32", DataType::INTEGER, false)]) + .into(), + false, + ), + false, + ), + StructField::new("j", 
DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ))), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 3]; + let expect_reorder = vec![0, 1, 3]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn nested_indices_list_mask_inner_reorder() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new( + "list", + ArrayType::new( + StructType::new(vec![ + StructField::new("string", DataType::INTEGER, false), + StructField::new("int2", DataType::INTEGER, false), + ]) + .into(), + false, + ), + false, + ), + StructField::new("j", DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), // field 0 + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int1", ArrowDataType::Int32, false), // field 1 + ArrowField::new("int2", ArrowDataType::Int32, false), // field 2 + ArrowField::new("string", ArrowDataType::Utf8, false), // field 3 + ] + .into(), + ), + false, + ))), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), // field 4 + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 2, 3, 4]; + let expect_reorder = vec![0, 2, 1, 3]; + assert_eq!(mask_indices, expect_mask); + 
assert_eq!(reorder_indices, expect_reorder); + } +} diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index 048b3d2ec..6c6370863 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -262,7 +262,7 @@ mod tests { let store = Arc::new(LocalFileSystem::new()); let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); let json_strings = StringArray::from(vec![ - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2, "maxRowId": 3}}}"# + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2, "maxRowId": 3}}}"#, ]); let output_schema = Arc::new(get_log_schema().clone()); @@ -271,10 +271,14 @@ mod tests { .unwrap() .into_any() .downcast::() - .map(|sd| sd.into()).unwrap(); + .map(|sd| sd.into()) + .unwrap(); assert_eq!(batch.column(0).len(), 1); let add_array = batch.column_by_name("add").unwrap().as_struct(); - let dv_col = 
add_array.column_by_name("deletionVector").unwrap().as_struct(); + let dv_col = add_array + .column_by_name("deletionVector") + .unwrap() + .as_struct(); assert!(dv_col.column_by_name("storageType").is_some()); assert!(dv_col.column_by_name("maxRowId").is_none()); } From 16c784675a3fbdb90307d6d6faf70d68346ff017 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 28 Jun 2024 14:39:24 -0700 Subject: [PATCH 03/54] working with ReorderIndex enum --- kernel/src/engine/arrow_utils.rs | 155 ++++++++++++++++++++------- kernel/src/engine/default/parquet.rs | 4 +- kernel/src/engine/sync/parquet.rs | 10 +- 3 files changed, 123 insertions(+), 46 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index c3d951538..0f7058e68 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -19,7 +19,8 @@ use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; /// stored is an `Index` variant, the associated `usize` is the position that the column should /// appear in the final output. If it is a `Child` variant, then at that index there is a `Struct` /// whose ordering is specified by the values in the associated `Vec` according to these same rules. 
-enum ReorderIndex { +#[derive(Debug, PartialEq)] +pub(crate) enum ReorderIndex { Index(usize), Child(Vec), } @@ -31,41 +32,39 @@ enum ReorderIndex { /// of parquet fields and the number of requested fields processed fn get_indices( start_parquet_offset: usize, - start_reorder_offset: usize, requested_schema: &Schema, fields: &Fields, mask_indices: &mut Vec, - reorder_indices: &mut Vec, -) -> DeltaResult<(usize, usize)> { +) -> DeltaResult<(usize, Vec)> { let mut found_fields = 0; let mut parquet_offset = start_parquet_offset; - let mut reorder_offset = start_reorder_offset; - println!("at top with parquet_offset {parquet_offset} and reorder_offset {reorder_offset}"); + let mut reorder_indices = vec![]; + //println!("at top with parquet_offset {parquet_offset}"); for (parquet_index, field) in fields.iter().enumerate() { - println!("looking at field {}", field.name()); + //println!("looking at field {}", field.name()); match field.data_type() { ArrowDataType::Struct(fields) => { - if let Some(requested_field) = requested_schema.fields.get(field.name()) { + if let Some((_index, _, requested_field)) = + requested_schema.fields.get_full(field.name()) + { match requested_field.data_type { DataType::Struct(ref requested_schema) => { - let (parquet_advance, reorder_advance) = get_indices( + let (parquet_advance, child_indices) = get_indices( found_fields + parquet_offset, - found_fields + reorder_offset, requested_schema.as_ref(), fields, mask_indices, - reorder_indices, )?; // advance the number of parquet fields, but subtract 1 because the // struct will be counted by the `enumerate` call but doesn't count as // an actual index. 
- println!("here:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}\n reorder_offset: {reorder_offset}\n reorder_advance: {reorder_advance}"); + //println!("here:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; - // advance the reorder offset - reorder_offset += reorder_advance; // also increase found_fields because the struct is a field we've found // and will count in the `requested_schema.fields.len()` call below found_fields += 1; + // push the child reorder on + reorder_indices.push(ReorderIndex::Child(child_indices)); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -74,7 +73,9 @@ fn get_indices( } } ArrowDataType::List(list_field) | ArrowDataType::ListView(list_field) => { - if let Some(requested_field) = requested_schema.fields.get(field.name()) { + if let Some((index, _, requested_field)) = + requested_schema.fields.get_full(field.name()) + { // we just want to transparently recurse into lists, need to transform the kernel // list data type into a schema match requested_field.data_type() { @@ -84,19 +85,28 @@ fn get_indices( array_type.element_type.clone(), array_type.contains_null, )]); - let (parquet_advance, reorder_advance) = get_indices( + let (parquet_advance, child_indices) = get_indices( found_fields + parquet_offset, - found_fields + reorder_offset, &requested_schema, &[list_field.clone()].into(), mask_indices, - reorder_indices, )?; // see comment above in struct match arm - println!("here list:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}\n reorder_offset: {reorder_offset}\n reorder_advance: {reorder_advance}"); + //println!("here list:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; - reorder_offset += reorder_advance - 1; // inner array field doesn't count found_fields += 1; + // we have to recurse to find the type, but for reordering a list we + 
// only need a child reordering if the inner type is a struct + if let ArrowDataType::Struct(_) = list_field.data_type() { + if child_indices.len() != 1 { + return Err(Error::generic("List call should not have generated more than one reorder index")); + } + // safety, checked that we have 1 element + let child_indices = child_indices.into_iter().next().unwrap(); + reorder_indices.push(child_indices); + } else { + reorder_indices.push(ReorderIndex::Index(index)); + } } _ => { return Err(Error::unexpected_column_type(list_field.name())); @@ -108,12 +118,12 @@ fn get_indices( if let Some(index) = requested_schema.index_of(field.name()) { found_fields += 1; mask_indices.push(parquet_offset + parquet_index); - reorder_indices.push(reorder_offset + index); // this should add the number of FOUND fields in total so far, not offset_index + reorder_indices.push(ReorderIndex::Index(index)); } } } } - println!("found {found_fields}"); + //println!("found {found_fields}"); //println!("found {found_fields}, requested {}. req schema: {:?}", requested_schema.fields.len(), requested_schema); require!( found_fields == requested_schema.fields.len(), @@ -121,7 +131,7 @@ fn get_indices( ); Ok(( parquet_offset + fields.len() - start_parquet_offset, - reorder_offset + requested_schema.fields.len() - start_reorder_offset, + reorder_indices, )) } @@ -139,18 +149,15 @@ fn get_indices( pub(crate) fn get_requested_indices( requested_schema: &SchemaRef, parquet_schema: &ArrowSchemaRef, -) -> DeltaResult<(Vec, Vec)> { +) -> DeltaResult<(Vec, Vec)> { let mut mask_indices = vec![]; - let mut reorder_indices = vec![]; - get_indices( - 0, + let (_, reorder_indexes) = get_indices( 0, requested_schema, parquet_schema.fields(), &mut mask_indices, - &mut reorder_indices, )?; - Ok((mask_indices, reorder_indices)) + Ok((mask_indices, reorder_indexes)) // FIX ME } /// Create a mask that will only select the specified indices from the parquet. 
Currently we only @@ -209,7 +216,13 @@ mod tests { use crate::schema::{ArrayType, DataType, StructField, StructType}; - use super::get_requested_indices; + use super::{get_requested_indices, ReorderIndex}; + + macro_rules! rii { + ($index: expr) => {{ + ReorderIndex::Index($index) + }}; + } #[test] fn simple_mask_indices() { @@ -225,9 +238,10 @@ mod tests { ])); let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); - let expect = vec![0, 1, 2]; - assert_eq!(mask_indices, expect); - assert_eq!(reorder_indices, expect); + let expect_mask = vec![0, 1, 2]; + let expect_reorder = vec![rii!(0), rii!(1), rii!(2)]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); } #[test] @@ -245,7 +259,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![2, 0, 1]; + let expect_reorder = vec![rii!(2), rii!(0), rii!(1)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -282,7 +296,11 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; - let expect_reorder = vec![0, 1, 2, 3]; + let expect_reorder = vec![ + rii!(0), + ReorderIndex::Child(vec![rii!(0), rii!(1)]), + rii!(2), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -319,7 +337,11 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; - let expect_reorder = vec![0, 1, 2, 3]; + let expect_reorder = vec![ + rii!(2), + ReorderIndex::Child(vec![rii!(1), rii!(0)]), + rii!(0), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -353,7 +375,7 @@ mod tests { let (mask_indices, reorder_indices) = 
get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; - let expect_reorder = vec![0, 1, 3]; + let expect_reorder = vec![rii!(0), ReorderIndex::Child(vec![rii!(0)]), rii!(2)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -381,7 +403,56 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![0, 1, 2]; + let expect_reorder = vec![rii!(0), rii!(1), rii!(2)]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn nested_indices_list() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new( + "list", + ArrayType::new( + StructType::new(vec![ + StructField::new("int32", DataType::INTEGER, false), + StructField::new("string", DataType::STRING, false), + ]) + .into(), + false, + ), + false, + ), + StructField::new("j", DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ))), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 2, 3]; + let expect_reorder = vec![ + rii!(0), + ReorderIndex::Child(vec![rii!(0), rii!(1)]), + rii!(2), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -423,7 +494,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, 
&arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; - let expect_reorder = vec![0, 1, 3]; + let expect_reorder = vec![rii!(0), ReorderIndex::Child(vec![rii!(0)]), rii!(2)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -469,7 +540,11 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 2, 3, 4]; - let expect_reorder = vec![0, 2, 1, 3]; + let expect_reorder = vec![ + rii!(0), + ReorderIndex::Child(vec![rii!(1), rii!(0)]), + rii!(2), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index fc66d5126..1c20e0bd0 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -142,7 +142,7 @@ impl FileOpener for ParquetOpener { let stream = stream.map(move |rbr| { // re-order each batch if needed rbr.map_err(Error::Parquet) - .and_then(|rb| reorder_record_batch(rb, &requested_ordering)) + //.and_then(|rb| reorder_record_batch(rb, &requested_ordering)) FIX ME }); Ok(stream.boxed()) })) @@ -205,7 +205,7 @@ impl FileOpener for PresignedUrlOpener { let stream = stream.map(move |rbr| { // re-order each batch if needed rbr.map_err(Error::Arrow) - .and_then(|rb| reorder_record_batch(rb, &requested_ordering)) + //.and_then(|rb| reorder_record_batch(rb, &requested_ordering)) FIX ME }); Ok(stream.boxed()) })) diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 828dbe104..b847e98c0 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -29,10 +29,12 @@ fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult Date: Fri, 28 Jun 2024 14:42:09 -0700 Subject: [PATCH 04/54] factor out common arrow schema usage --- kernel/src/engine/arrow_utils.rs | 66 ++++++++++---------------------- 1 file changed, 21 
insertions(+), 45 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 0f7058e68..7a0b87adb 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -224,6 +224,24 @@ mod tests { }}; } + fn nested_arrow_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])) + } + #[test] fn simple_mask_indices() { let kernel_schema = Arc::new(StructType::new(vec![ @@ -278,21 +296,7 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - ArrowField::new("i", ArrowDataType::Int32, false), - ArrowField::new( - "nested", - ArrowDataType::Struct( - vec![ - ArrowField::new("int32", ArrowDataType::Int32, false), - ArrowField::new("string", ArrowDataType::Utf8, false), - ] - .into(), - ), - false, - ), - ArrowField::new("j", ArrowDataType::Int32, false), - ])); + let arrow_schema = nested_arrow_schema(); let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; @@ -319,21 +323,7 @@ mod tests { ), StructField::new("i", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - ArrowField::new("i", ArrowDataType::Int32, false), - ArrowField::new( - "nested", - ArrowDataType::Struct( - vec![ - ArrowField::new("int32", ArrowDataType::Int32, false), - ArrowField::new("string", ArrowDataType::Utf8, false), - ] - .into(), - ), - false, - ), - ArrowField::new("j", ArrowDataType::Int32, false), - ])); + let arrow_schema = nested_arrow_schema(); let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, 
&arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; @@ -357,21 +347,7 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - ArrowField::new("i", ArrowDataType::Int32, false), - ArrowField::new( - "nested", - ArrowDataType::Struct( - vec![ - ArrowField::new("int32", ArrowDataType::Int32, false), - ArrowField::new("string", ArrowDataType::Utf8, false), - ] - .into(), - ), - false, - ), - ArrowField::new("j", ArrowDataType::Int32, false), - ])); + let arrow_schema = nested_arrow_schema(); let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; From 014a5a874963d9a2df72772b8b70193422dbe74e Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 28 Jun 2024 15:05:28 -0700 Subject: [PATCH 05/54] child types need an index too --- kernel/src/engine/arrow_utils.rs | 112 ++++++++++++++++++++++--------- 1 file changed, 79 insertions(+), 33 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 7a0b87adb..5d006910d 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -22,7 +22,10 @@ use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; #[derive(Debug, PartialEq)] pub(crate) enum ReorderIndex { Index(usize), - Child(Vec), + Child { + index: usize, + children: Vec, + }, } /// helper function, does the same as `get_requested_indices` but at an offset. 
used to recurse into @@ -44,12 +47,12 @@ fn get_indices( //println!("looking at field {}", field.name()); match field.data_type() { ArrowDataType::Struct(fields) => { - if let Some((_index, _, requested_field)) = + if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { match requested_field.data_type { DataType::Struct(ref requested_schema) => { - let (parquet_advance, child_indices) = get_indices( + let (parquet_advance, children) = get_indices( found_fields + parquet_offset, requested_schema.as_ref(), fields, @@ -64,7 +67,7 @@ fn get_indices( // and will count in the `requested_schema.fields.len()` call below found_fields += 1; // push the child reorder on - reorder_indices.push(ReorderIndex::Child(child_indices)); + reorder_indices.push(ReorderIndex::Child { index, children }); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -85,7 +88,7 @@ fn get_indices( array_type.element_type.clone(), array_type.contains_null, )]); - let (parquet_advance, child_indices) = get_indices( + let (parquet_advance, children) = get_indices( found_fields + parquet_offset, &requested_schema, &[list_field.clone()].into(), @@ -98,12 +101,31 @@ fn get_indices( // we have to recurse to find the type, but for reordering a list we // only need a child reordering if the inner type is a struct if let ArrowDataType::Struct(_) = list_field.data_type() { - if child_indices.len() != 1 { - return Err(Error::generic("List call should not have generated more than one reorder index")); + if children.len() != 1 { + return Err( + Error::generic( + "List call should not have generated more than one reorder index" + ) + ); } // safety, checked that we have 1 element - let child_indices = child_indices.into_iter().next().unwrap(); - reorder_indices.push(child_indices); + let mut children = children.into_iter().next().unwrap(); + // the index is wrong though, as it's the index from the inner + // schema. 
Adjust it to be our index + if let ReorderIndex::Child { + index: ref mut child_index, + .. + } = children + { + *child_index = index; + } else { + return Err( + Error::generic( + "List call should have returned a ReorderIndex::Child variant" + ) + ); + } + reorder_indices.push(children); } else { reorder_indices.push(ReorderIndex::Index(index)); } @@ -157,29 +179,24 @@ pub(crate) fn get_requested_indices( parquet_schema.fields(), &mut mask_indices, )?; - Ok((mask_indices, reorder_indexes)) // FIX ME + Ok((mask_indices, reorder_indexes)) } /// Create a mask that will only select the specified indices from the parquet. Currently we only /// handle "root" level columns, and hence use `ProjectionMask::roots`, but will support leaf /// selection in the future. See issues #86 and #96 as well. pub(crate) fn generate_mask( - requested_schema: &SchemaRef, - parquet_schema: &ArrowSchemaRef, + _requested_schema: &SchemaRef, + _parquet_schema: &ArrowSchemaRef, parquet_physical_schema: &SchemaDescriptor, indices: &[usize], ) -> Option { - if parquet_schema.fields.size() == requested_schema.fields.len() { - // we assume that in get_requested_indices we will have caught any column name mismatches, - // so here we can just say that if we request the same # of columns as the parquet file - // actually has, we don't need to mask anything out - None - } else { - Some(ProjectionMask::roots( - parquet_physical_schema, - indices.to_owned(), - )) - } + // TODO: Determine if it's worth checking if we're selecting everything and returning None in + // that case + Some(ProjectionMask::leaves( + parquet_physical_schema, + indices.to_owned(), + )) } /// Reorder a RecordBatch to match `requested_ordering`. 
For each non-zero value in @@ -212,7 +229,10 @@ pub(crate) fn reorder_record_batch( mod tests { use std::sync::Arc; - use arrow_schema::{DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema}; + use arrow_schema::{ + DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, + SchemaRef as ArrowSchemaRef, + }; use crate::schema::{ArrayType, DataType, StructField, StructType}; @@ -224,7 +244,7 @@ mod tests { }}; } - fn nested_arrow_schema() -> Arc { + fn nested_arrow_schema() -> ArrowSchemaRef { Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new( @@ -302,7 +322,10 @@ mod tests { let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ rii!(0), - ReorderIndex::Child(vec![rii!(0), rii!(1)]), + ReorderIndex::Child { + index: 1, + children: vec![rii!(0), rii!(1)], + }, rii!(2), ]; assert_eq!(mask_indices, expect_mask); @@ -312,7 +335,6 @@ mod tests { #[test] fn nested_indices_reorder() { let kernel_schema = Arc::new(StructType::new(vec![ - StructField::new("j", DataType::INTEGER, false), StructField::new( "nested", StructType::new(vec![ @@ -321,6 +343,7 @@ mod tests { ]), false, ), + StructField::new("j", DataType::INTEGER, false), StructField::new("i", DataType::INTEGER, false), ])); let arrow_schema = nested_arrow_schema(); @@ -329,8 +352,11 @@ mod tests { let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ rii!(2), - ReorderIndex::Child(vec![rii!(1), rii!(0)]), - rii!(0), + ReorderIndex::Child { + index: 0, + children: vec![rii!(1), rii!(0)], + }, + rii!(1), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -351,7 +377,14 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; - let expect_reorder = vec![rii!(0), ReorderIndex::Child(vec![rii!(0)]), rii!(2)]; + let expect_reorder = vec![ + rii!(0), + ReorderIndex::Child { + index: 1, + children: 
vec![rii!(0)], + }, + rii!(2), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -426,7 +459,10 @@ mod tests { let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ rii!(0), - ReorderIndex::Child(vec![rii!(0), rii!(1)]), + ReorderIndex::Child { + index: 1, + children: vec![rii!(0), rii!(1)], + }, rii!(2), ]; assert_eq!(mask_indices, expect_mask); @@ -470,7 +506,14 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; - let expect_reorder = vec![rii!(0), ReorderIndex::Child(vec![rii!(0)]), rii!(2)]; + let expect_reorder = vec![ + rii!(0), + ReorderIndex::Child { + index: 1, + children: vec![rii!(0)], + }, + rii!(2), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -518,7 +561,10 @@ mod tests { let expect_mask = vec![0, 2, 3, 4]; let expect_reorder = vec![ rii!(0), - ReorderIndex::Child(vec![rii!(1), rii!(0)]), + ReorderIndex::Child { + index: 1, + children: vec![rii!(1), rii!(0)], + }, rii!(2), ]; assert_eq!(mask_indices, expect_mask); From 3328457fdbf1f58e5b1e753e96fc66dc78331af0 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 28 Jun 2024 15:18:42 -0700 Subject: [PATCH 06/54] reordering is back, but some things broken, no nested reorder yet --- kernel/src/engine/arrow_utils.rs | 42 ++++++++++++++++++++++++---- kernel/src/engine/default/parquet.rs | 4 +-- kernel/src/engine/sync/parquet.rs | 10 +++---- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 5d006910d..fe9c5a4fd 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -28,6 +28,23 @@ pub(crate) enum ReorderIndex { }, } +impl ReorderIndex { + fn index(&self) -> usize { + match self { + ReorderIndex::Index(index) => *index, + ReorderIndex::Child{ index, .. 
} => *index, + } + } + + /// check if this indexing is ordered. an `Index` variant is ordered by definition + fn is_ordered(&self) -> bool { + match self { + ReorderIndex::Index(_) => true, + ReorderIndex::Child{ index: _ , ref children } => is_ordered(children) + } + } +} + /// helper function, does the same as `get_requested_indices` but at an offset. used to recurse into /// structs. this is called recursively to traverse into structs and lists. `parquet_offset` is how /// many parquet fields exist before processing this potentially nested schema. `reorder_offset` is @@ -199,13 +216,28 @@ pub(crate) fn generate_mask( )) } +/// Check if an ordering is already ordered +fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { + if requested_ordering.len() == 0 { + return true; + } + // we have >=1 element. check that the first element is ordered + if !requested_ordering[0].is_ordered() { + return false; + } + // now check that all elements are ordered wrt. each other, and are internally ordered + requested_ordering.windows(2).all(|ri| { + (ri[0].index() < ri[1].index()) && ri[1].is_ordered() + }) +} + /// Reorder a RecordBatch to match `requested_ordering`. 
For each non-zero value in /// `requested_ordering`, the column at that index will be added in order to returned batch pub(crate) fn reorder_record_batch( input_data: RecordBatch, - requested_ordering: &[usize], + requested_ordering: &[ReorderIndex], ) -> DeltaResult { - if requested_ordering.windows(2).all(|is| is[0] < is[1]) { + if is_ordered(requested_ordering) { // indices is already sorted, meaning we requested in the order that the columns were // stored in the parquet Ok(input_data) @@ -215,9 +247,9 @@ pub(crate) fn reorder_record_batch( let mut fields = Vec::with_capacity(requested_ordering.len()); let reordered_columns = requested_ordering .iter() - .map(|index| { - fields.push(input_schema.field(*index).clone()); - input_data.column(*index).clone() // cheap Arc clone + .map(|reorder_index| { + fields.push(input_schema.field(reorder_index.index()).clone()); + input_data.column(reorder_index.index()).clone() // cheap Arc clone }) .collect(); let schema = Arc::new(ArrowSchema::new(fields)); diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index 1c20e0bd0..fc66d5126 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -142,7 +142,7 @@ impl FileOpener for ParquetOpener { let stream = stream.map(move |rbr| { // re-order each batch if needed rbr.map_err(Error::Parquet) - //.and_then(|rb| reorder_record_batch(rb, &requested_ordering)) FIX ME + .and_then(|rb| reorder_record_batch(rb, &requested_ordering)) }); Ok(stream.boxed()) })) @@ -205,7 +205,7 @@ impl FileOpener for PresignedUrlOpener { let stream = stream.map(move |rbr| { // re-order each batch if needed rbr.map_err(Error::Arrow) - //.and_then(|rb| reorder_record_batch(rb, &requested_ordering)) FIX ME + .and_then(|rb| reorder_record_batch(rb, &requested_ordering)) }); Ok(stream.boxed()) })) diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index b847e98c0..828dbe104 100644 --- 
a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -29,12 +29,10 @@ fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult Date: Fri, 28 Jun 2024 16:02:10 -0700 Subject: [PATCH 07/54] checkpoint --- kernel/src/engine/arrow_utils.rs | 73 +++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index fe9c5a4fd..e32607d83 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -21,9 +21,13 @@ use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; /// whose ordering is specified by the values in the associated `Vec` according to these same rules. #[derive(Debug, PartialEq)] pub(crate) enum ReorderIndex { - Index(usize), + Index{ + index: usize, + is_null: bool, + }, Child { index: usize, + is_null: bool, children: Vec, }, } @@ -31,7 +35,7 @@ pub(crate) enum ReorderIndex { impl ReorderIndex { fn index(&self) -> usize { match self { - ReorderIndex::Index(index) => *index, + ReorderIndex::Index{ index, .. } => *index, ReorderIndex::Child{ index, .. } => *index, } } @@ -39,8 +43,8 @@ impl ReorderIndex { /// check if this indexing is ordered. an `Index` variant is ordered by definition fn is_ordered(&self) -> bool { match self { - ReorderIndex::Index(_) => true, - ReorderIndex::Child{ index: _ , ref children } => is_ordered(children) + ReorderIndex::Index{ .. } => true, + ReorderIndex::Child{ ref children, .. 
} => is_ordered(children) } } } @@ -56,9 +60,9 @@ fn get_indices( fields: &Fields, mask_indices: &mut Vec, ) -> DeltaResult<(usize, Vec)> { - let mut found_fields = 0; + let mut found_fields = Vec::with_capacity(requested_schema.fields.len()); + let mut reorder_indices = Vec::with_capacity(requested_schema.fields.len()); let mut parquet_offset = start_parquet_offset; - let mut reorder_indices = vec![]; //println!("at top with parquet_offset {parquet_offset}"); for (parquet_index, field) in fields.iter().enumerate() { //println!("looking at field {}", field.name()); @@ -70,7 +74,7 @@ fn get_indices( match requested_field.data_type { DataType::Struct(ref requested_schema) => { let (parquet_advance, children) = get_indices( - found_fields + parquet_offset, + found_fields.len() + parquet_offset, requested_schema.as_ref(), fields, mask_indices, @@ -80,11 +84,10 @@ fn get_indices( // an actual index. //println!("here:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; - // also increase found_fields because the struct is a field we've found - // and will count in the `requested_schema.fields.len()` call below - found_fields += 1; + // note that we found this field + found_fields.push(requested_field); // push the child reorder on - reorder_indices.push(ReorderIndex::Child { index, children }); + reorder_indices.push(ReorderIndex::Child { index, is_null: false, children, }); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -106,7 +109,7 @@ fn get_indices( array_type.contains_null, )]); let (parquet_advance, children) = get_indices( - found_fields + parquet_offset, + found_fields.len() + parquet_offset, &requested_schema, &[list_field.clone()].into(), mask_indices, @@ -114,7 +117,7 @@ fn get_indices( // see comment above in struct match arm //println!("here list:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; - found_fields 
+= 1; + found_fields.push(requested_field); // we have to recurse to find the type, but for reordering a list we // only need a child reordering if the inner type is a struct if let ArrowDataType::Struct(_) = list_field.data_type() { @@ -144,7 +147,7 @@ fn get_indices( } reorder_indices.push(children); } else { - reorder_indices.push(ReorderIndex::Index(index)); + reorder_indices.push(ReorderIndex::Index{index, is_null: false}); } } _ => { @@ -154,18 +157,23 @@ fn get_indices( } } _ => { - if let Some(index) = requested_schema.index_of(field.name()) { - found_fields += 1; + if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { + found_fields.push(requested_field); mask_indices.push(parquet_offset + parquet_index); - reorder_indices.push(ReorderIndex::Index(index)); + reorder_indices.push(ReorderIndex::Index{index, is_null: false}); } } } } //println!("found {found_fields}"); //println!("found {found_fields}, requested {}. req schema: {:?}", requested_schema.fields.len(), requested_schema); + if found_fields.len() != requested_schema.fields.len() { + // some fields are missing, but they might be nullable, need to insert them into the reorder_indices + println!("Found {found_fields:?}, but requested {}", requested_schema.fields.len()); + println!("reorder here is: {reorder_indices:?}"); + } require!( - found_fields == requested_schema.fields.len(), + found_fields.len() == requested_schema.fields.len(), Error::generic("Didn't find all requested columns in parquet schema") ); Ok(( @@ -189,6 +197,7 @@ pub(crate) fn get_requested_indices( requested_schema: &SchemaRef, parquet_schema: &ArrowSchemaRef, ) -> DeltaResult<(Vec, Vec)> { + //println!("Called with\n---\n{requested_schema:#?}\n---\n{parquet_schema:#?}"); let mut mask_indices = vec![]; let (_, reorder_indexes) = get_indices( 0, @@ -242,6 +251,7 @@ pub(crate) fn reorder_record_batch( // stored in the parquet Ok(input_data) } else { + println!("ORDERING {requested_ordering:?}"); 
// requested an order different from the parquet, reorder let input_schema = input_data.schema(); let mut fields = Vec::with_capacity(requested_ordering.len()); @@ -272,7 +282,7 @@ mod tests { macro_rules! rii { ($index: expr) => {{ - ReorderIndex::Index($index) + ReorderIndex::Index{index: $index, is_null: false} }}; } @@ -334,6 +344,25 @@ mod tests { assert_eq!(reorder_indices, expect_reorder); } + #[test] + fn simple_nullable_field_missing() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new("s", DataType::STRING, true), + StructField::new("i2", DataType::INTEGER, true), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("i2", ArrowDataType::Int32, true), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1, 2]; + let expect_reorder = vec![rii!(0), rii!(1), rii!(2)]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + #[test] fn nested_indices() { let kernel_schema = Arc::new(StructType::new(vec![ @@ -356,6 +385,7 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, + is_null: false, children: vec![rii!(0), rii!(1)], }, rii!(2), @@ -386,6 +416,7 @@ mod tests { rii!(2), ReorderIndex::Child { index: 0, + is_null: false, children: vec![rii!(1), rii!(0)], }, rii!(1), @@ -413,6 +444,7 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, + is_null: false, children: vec![rii!(0)], }, rii!(2), @@ -493,6 +525,7 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, + is_null: false, children: vec![rii!(0), rii!(1)], }, rii!(2), @@ -542,6 +575,7 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, + is_null: false, children: vec![rii!(0)], }, rii!(2), @@ -595,6 +629,7 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, + is_null: false, children: vec![rii!(1), rii!(0)], }, 
rii!(2), From f5627151a96051253595168fa505cc209f426bbe Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 28 Jun 2024 16:54:03 -0700 Subject: [PATCH 08/54] actually fix column ordering --- kernel/src/engine/arrow_utils.rs | 85 +++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index e32607d83..bfba5c8f3 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -8,20 +8,21 @@ use crate::{ DeltaResult, Error, }; -use arrow_array::RecordBatch; +use arrow_array::{Array as ArrowArray, RecordBatch}; use arrow_schema::{ - DataType as ArrowDataType, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, + DataType as ArrowDataType, Field as ArrowField, Fields, Schema as ArrowSchema, + SchemaRef as ArrowSchemaRef, }; use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; /// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. Each element's index -/// represents a column that will be in the read parquet data at that level and index. If the value -/// stored is an `Index` variant, the associated `usize` is the position that the column should -/// appear in the final output. If it is a `Child` variant, then at that index there is a `Struct` -/// whose ordering is specified by the values in the associated `Vec` according to these same rules. +/// represents a column that will be in the read parquet data at that level and index. The `index()` +/// of the element is the position that the column should appear in the final output. If it is a +/// `Child` variant, then at that index there is a `Struct` whose ordering is specified by the +/// values in the associated `Vec` according to these same rules. 
#[derive(Debug, PartialEq)] pub(crate) enum ReorderIndex { - Index{ + Index { index: usize, is_null: bool, }, @@ -35,16 +36,16 @@ pub(crate) enum ReorderIndex { impl ReorderIndex { fn index(&self) -> usize { match self { - ReorderIndex::Index{ index, .. } => *index, - ReorderIndex::Child{ index, .. } => *index, + ReorderIndex::Index { index, .. } => *index, + ReorderIndex::Child { index, .. } => *index, } } /// check if this indexing is ordered. an `Index` variant is ordered by definition fn is_ordered(&self) -> bool { match self { - ReorderIndex::Index{ .. } => true, - ReorderIndex::Child{ ref children, .. } => is_ordered(children) + ReorderIndex::Index { .. } => true, + ReorderIndex::Child { ref children, .. } => is_ordered(children), } } } @@ -87,7 +88,11 @@ fn get_indices( // note that we found this field found_fields.push(requested_field); // push the child reorder on - reorder_indices.push(ReorderIndex::Child { index, is_null: false, children, }); + reorder_indices.push(ReorderIndex::Child { + index, + is_null: false, + children, + }); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -147,7 +152,10 @@ fn get_indices( } reorder_indices.push(children); } else { - reorder_indices.push(ReorderIndex::Index{index, is_null: false}); + reorder_indices.push(ReorderIndex::Index { + index, + is_null: false, + }); } } _ => { @@ -157,10 +165,15 @@ fn get_indices( } } _ => { - if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { + if let Some((index, _, requested_field)) = + requested_schema.fields.get_full(field.name()) + { found_fields.push(requested_field); mask_indices.push(parquet_offset + parquet_index); - reorder_indices.push(ReorderIndex::Index{index, is_null: false}); + reorder_indices.push(ReorderIndex::Index { + index, + is_null: false, + }); } } } @@ -169,7 +182,10 @@ fn get_indices( //println!("found {found_fields}, requested {}. 
req schema: {:?}", requested_schema.fields.len(), requested_schema); if found_fields.len() != requested_schema.fields.len() { // some fields are missing, but they might be nullable, need to insert them into the reorder_indices - println!("Found {found_fields:?}, but requested {}", requested_schema.fields.len()); + println!( + "Found {found_fields:?}, but requested {}", + requested_schema.fields.len() + ); println!("reorder here is: {reorder_indices:?}"); } require!( @@ -235,9 +251,9 @@ fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { return false; } // now check that all elements are ordered wrt. each other, and are internally ordered - requested_ordering.windows(2).all(|ri| { - (ri[0].index() < ri[1].index()) && ri[1].is_ordered() - }) + requested_ordering + .windows(2) + .all(|ri| (ri[0].index() < ri[1].index()) && ri[1].is_ordered()) } /// Reorder a RecordBatch to match `requested_ordering`. For each non-zero value in @@ -251,18 +267,26 @@ pub(crate) fn reorder_record_batch( // stored in the parquet Ok(input_data) } else { - println!("ORDERING {requested_ordering:?}"); + //println!("ORDERING {requested_ordering:?}"); // requested an order different from the parquet, reorder let input_schema = input_data.schema(); - let mut fields = Vec::with_capacity(requested_ordering.len()); - let reordered_columns = requested_ordering + let mut final_fields_cols: Vec, Arc)>> = + std::iter::repeat_with(|| None) + .take(requested_ordering.len()) + .collect(); + for (parquet_position, reorder_index) in requested_ordering.iter().enumerate() { + // for each item, reorder_index.index() tells us where to put it, and its position in + // requested_ordering tells us where it is in the parquet data + final_fields_cols[reorder_index.index()] = Some(( + Arc::new(input_schema.field(parquet_position).clone()), + input_data.column(parquet_position).clone(), // cheap Arc clone + )); + } + let field_iter = final_fields_cols .iter() - .map(|reorder_index| { - 
fields.push(input_schema.field(reorder_index.index()).clone()); - input_data.column(reorder_index.index()).clone() // cheap Arc clone - }) - .collect(); - let schema = Arc::new(ArrowSchema::new(fields)); + .map(|fco| fco.as_ref().unwrap().0.clone()); + let schema = Arc::new(ArrowSchema::new(Fields::from_iter(field_iter))); + let reordered_columns = final_fields_cols.into_iter().map(|fco| fco.unwrap().1).collect(); Ok(RecordBatch::try_new(schema, reordered_columns)?) } } @@ -282,7 +306,10 @@ mod tests { macro_rules! rii { ($index: expr) => {{ - ReorderIndex::Index{index: $index, is_null: false} + ReorderIndex::Index { + index: $index, + is_null: false, + } }}; } From 4134e9aade402abf527117a44f2258f84a3c4bce Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 1 Jul 2024 12:31:13 -0700 Subject: [PATCH 09/54] checkpoint, properly skipping inner structs --- kernel/src/engine/arrow_utils.rs | 141 ++++++++++++++++++++++++++----- 1 file changed, 121 insertions(+), 20 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index bfba5c8f3..38c94f10d 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -1,10 +1,9 @@ //! Some utilities for working with arrow data types -use std::sync::Arc; +use std::{collections::HashSet, sync::Arc}; use crate::{ schema::{DataType, Schema, SchemaRef, StructField, StructType}, - utils::require, DeltaResult, Error, }; @@ -14,6 +13,7 @@ use arrow_schema::{ SchemaRef as ArrowSchemaRef, }; use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; +use tracing::debug; /// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. Each element's index /// represents a column that will be in the read parquet data at that level and index. 
The `index()` @@ -50,6 +50,24 @@ impl ReorderIndex { } } +// count the number of physical columns, including nested ones in an `ArrowField` +fn count_cols(field: &ArrowField) -> usize { + _count_cols(field.data_type()) +} + +fn _count_cols(dt: &ArrowDataType) -> usize { + match dt { + ArrowDataType::Struct(fields) => fields.iter().fold(0, |acc, f| acc + count_cols(f)), + ArrowDataType::Union(fields, _) => fields.iter().fold(0, |acc, (_, f)| acc + count_cols(f)), + ArrowDataType::List(field) + | ArrowDataType::LargeList(field) + | ArrowDataType::FixedSizeList(field, _) + | ArrowDataType::Map(field, _) => count_cols(field), + ArrowDataType::Dictionary(_, value_field) => _count_cols(value_field.as_ref()), + _ => 1, // other types are "real" fields, so count + } +} + /// helper function, does the same as `get_requested_indices` but at an offset. used to recurse into /// structs. this is called recursively to traverse into structs and lists. `parquet_offset` is how /// many parquet fields exist before processing this potentially nested schema. 
`reorder_offset` is @@ -61,12 +79,12 @@ fn get_indices( fields: &Fields, mask_indices: &mut Vec, ) -> DeltaResult<(usize, Vec)> { - let mut found_fields = Vec::with_capacity(requested_schema.fields.len()); + let mut found_fields = HashSet::with_capacity(requested_schema.fields.len()); let mut reorder_indices = Vec::with_capacity(requested_schema.fields.len()); let mut parquet_offset = start_parquet_offset; //println!("at top with parquet_offset {parquet_offset}"); for (parquet_index, field) in fields.iter().enumerate() { - //println!("looking at field {}", field.name()); + debug!("Getting indices for field {} with offset {parquet_offset}, with index {parquet_index}", field.name()); match field.data_type() { ArrowDataType::Struct(fields) => { if let Some((index, _, requested_field)) = @@ -75,7 +93,7 @@ fn get_indices( match requested_field.data_type { DataType::Struct(ref requested_schema) => { let (parquet_advance, children) = get_indices( - found_fields.len() + parquet_offset, + parquet_index + parquet_offset, requested_schema.as_ref(), fields, mask_indices, @@ -86,7 +104,7 @@ fn get_indices( //println!("here:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; // note that we found this field - found_fields.push(requested_field); + found_fields.insert(requested_field.name()); // push the child reorder on reorder_indices.push(ReorderIndex::Child { index, @@ -98,6 +116,12 @@ fn get_indices( return Err(Error::unexpected_column_type(field.name())); } } + } else { + // We're NOT selecting this field, but we still need to update how much we skip + debug!("Skipping over un-selected struct: {}", field.name()); + // offset by number of inner fields. 
subtract one, because the enumerate still + // counts this field + parquet_offset += count_cols(field) - 1; } } ArrowDataType::List(list_field) | ArrowDataType::ListView(list_field) => { @@ -122,7 +146,7 @@ fn get_indices( // see comment above in struct match arm //println!("here list:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; - found_fields.push(requested_field); + found_fields.insert(requested_field.name()); // we have to recurse to find the type, but for reordering a list we // only need a child reordering if the inner type is a struct if let ArrowDataType::Struct(_) = list_field.data_type() { @@ -168,7 +192,7 @@ fn get_indices( if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { - found_fields.push(requested_field); + found_fields.insert(requested_field.name()); mask_indices.push(parquet_offset + parquet_index); reorder_indices.push(ReorderIndex::Index { index, @@ -182,16 +206,24 @@ fn get_indices( //println!("found {found_fields}, requested {}. 
req schema: {:?}", requested_schema.fields.len(), requested_schema); if found_fields.len() != requested_schema.fields.len() { // some fields are missing, but they might be nullable, need to insert them into the reorder_indices - println!( - "Found {found_fields:?}, but requested {}", - requested_schema.fields.len() - ); - println!("reorder here is: {reorder_indices:?}"); + for (requested_position, field) in requested_schema.fields().enumerate() { + if !found_fields.contains(field.name()) { + if field.nullable { + println!("Inserting missing and nullable field: {}", field.name()); + reorder_indices.push(ReorderIndex::Index{ index: requested_position, is_null: true}); + } else { + return Err(Error::Generic(format!( + "Requested field not found in parquet schema, and field is not nullable: {}", + field.name() + ))); + } + } + } } - require!( - found_fields.len() == requested_schema.fields.len(), - Error::generic("Didn't find all requested columns in parquet schema") - ); + // require!( + // found_fields.len() == requested_schema.fields.len(), + // Error::generic("Didn't find all requested columns in parquet schema") + // ); Ok(( parquet_offset + fields.len() - start_parquet_offset, reorder_indices, @@ -221,6 +253,8 @@ pub(crate) fn get_requested_indices( parquet_schema.fields(), &mut mask_indices, )?; + println!("parquet_schema: {parquet_schema:#?}"); + println!("mask {mask_indices:?}"); Ok((mask_indices, reorder_indexes)) } @@ -267,9 +301,10 @@ pub(crate) fn reorder_record_batch( // stored in the parquet Ok(input_data) } else { - //println!("ORDERING {requested_ordering:?}"); + println!("ORDERING {requested_ordering:#?}, rows {}", input_data.num_rows()); // requested an order different from the parquet, reorder let input_schema = input_data.schema(); + println!("data: {input_data:#?}"); let mut final_fields_cols: Vec, Arc)>> = std::iter::repeat_with(|| None) .take(requested_ordering.len()) @@ -277,6 +312,7 @@ pub(crate) fn reorder_record_batch( for (parquet_position, 
reorder_index) in requested_ordering.iter().enumerate() { // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data + println!("{parquet_position} in {input_schema:?}"); final_fields_cols[reorder_index.index()] = Some(( Arc::new(input_schema.field(parquet_position).clone()), input_data.column(parquet_position).clone(), // cheap Arc clone @@ -313,6 +349,15 @@ mod tests { }}; } + macro_rules! rii_null { + ($index: expr) => {{ + ReorderIndex::Index { + index: $index, + is_null: true, + } + }}; + } + fn nested_arrow_schema() -> ArrowSchemaRef { Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), @@ -384,8 +429,8 @@ mod tests { ])); let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); - let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![rii!(0), rii!(1), rii!(2)]; + let expect_mask = vec![0, 1]; + let expect_reorder = vec![rii!(0), rii!(2), rii_null!(1)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -664,4 +709,60 @@ mod tests { assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } + + #[test] + fn skipped_struct() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new( + "nested", + StructType::new(vec![ + StructField::new("int32", DataType::INTEGER, false), + StructField::new("string", DataType::STRING, false), + ]), + false, + ), + StructField::new("j", DataType::INTEGER, false), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new( + "skipped", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ArrowField::new( + "nested", + 
ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + ), + ArrowField::new("i", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![2, 3, 4, 5]; + let expect_reorder = vec![ + rii!(2), + ReorderIndex::Child { + index: 1, + is_null: false, + children: vec![rii!(0), rii!(1)], + }, + rii!(0), + ]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } } From b5f5983a49c666c8ded0b1d891991c14711dd775 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 1 Jul 2024 15:13:08 -0700 Subject: [PATCH 10/54] working! other than re-ordering children --- kernel/src/engine/arrow_utils.rs | 155 +++++++++++++++++++-------- kernel/src/engine/default/parquet.rs | 12 ++- kernel/src/engine/sync/parquet.rs | 8 +- kernel/src/schema.rs | 8 ++ 4 files changed, 129 insertions(+), 54 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 38c94f10d..490516745 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -7,10 +7,9 @@ use crate::{ DeltaResult, Error, }; -use arrow_array::{Array as ArrowArray, RecordBatch}; +use arrow_array::{Array as ArrowArray, StructArray}; use arrow_schema::{ - DataType as ArrowDataType, Field as ArrowField, Fields, Schema as ArrowSchema, - SchemaRef as ArrowSchemaRef, + DataType as ArrowDataType, Field as ArrowField, Fields, SchemaRef as ArrowSchemaRef, }; use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; use tracing::debug; @@ -60,9 +59,9 @@ fn _count_cols(dt: &ArrowDataType) -> usize { ArrowDataType::Struct(fields) => fields.iter().fold(0, |acc, f| acc + count_cols(f)), ArrowDataType::Union(fields, _) => fields.iter().fold(0, |acc, (_, f)| acc + count_cols(f)), ArrowDataType::List(field) - | 
ArrowDataType::LargeList(field) - | ArrowDataType::FixedSizeList(field, _) - | ArrowDataType::Map(field, _) => count_cols(field), + | ArrowDataType::LargeList(field) + | ArrowDataType::FixedSizeList(field, _) + | ArrowDataType::Map(field, _) => count_cols(field), ArrowDataType::Dictionary(_, value_field) => _count_cols(value_field.as_ref()), _ => 1, // other types are "real" fields, so count } @@ -82,9 +81,11 @@ fn get_indices( let mut found_fields = HashSet::with_capacity(requested_schema.fields.len()); let mut reorder_indices = Vec::with_capacity(requested_schema.fields.len()); let mut parquet_offset = start_parquet_offset; - //println!("at top with parquet_offset {parquet_offset}"); for (parquet_index, field) in fields.iter().enumerate() { - debug!("Getting indices for field {} with offset {parquet_offset}, with index {parquet_index}", field.name()); + debug!( + "Getting indices for field {} with offset {parquet_offset}, with index {parquet_index}", + field.name() + ); match field.data_type() { ArrowDataType::Struct(fields) => { if let Some((index, _, requested_field)) = @@ -101,7 +102,6 @@ fn get_indices( // advance the number of parquet fields, but subtract 1 because the // struct will be counted by the `enumerate` call but doesn't count as // an actual index. - //println!("here:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; // note that we found this field found_fields.insert(requested_field.name()); @@ -121,7 +121,7 @@ fn get_indices( debug!("Skipping over un-selected struct: {}", field.name()); // offset by number of inner fields. 
subtract one, because the enumerate still // counts this field - parquet_offset += count_cols(field) - 1; + parquet_offset += count_cols(field) - 1; } } ArrowDataType::List(list_field) | ArrowDataType::ListView(list_field) => { @@ -144,7 +144,6 @@ fn get_indices( mask_indices, )?; // see comment above in struct match arm - //println!("here list:\n parquet_offset: {parquet_offset}\n parquet_advance: {parquet_advance}"); parquet_offset += parquet_advance - 1; found_fields.insert(requested_field.name()); // we have to recurse to find the type, but for reordering a list we @@ -188,6 +187,52 @@ fn get_indices( } } } + ArrowDataType::Map(key_val_field, _) => { + if let Some((index, _, requested_field)) = + requested_schema.fields.get_full(field.name()) + { + match (key_val_field.data_type(), requested_field.data_type()) { + (ArrowDataType::Struct(inner_fields), DataType::Map(map_type)) => { + let mut key_val_names = + inner_fields.iter().map(|f| f.name().to_string()); + let key_name = if let Some(key_name) = key_val_names.next() { + key_name + } else { + return Err(Error::generic("map fields didn't include a key col")); + }; + let val_name = if let Some(val_name) = key_val_names.next() { + val_name + } else { + return Err(Error::generic("map fields didn't include a val col")); + }; + if key_val_names.next().is_some() { + return Err(Error::generic("map fields had more than 2 members")); + } + let inner_schema = map_type.as_struct_schema(key_name, val_name); + let (parquet_advance, _children) = get_indices( + parquet_index + parquet_offset, + &inner_schema, + inner_fields, + mask_indices, + )?; + // advance the number of parquet fields, but subtract 1 because the + // map will be counted by the `enumerate` call but doesn't count as + // an actual index. 
+ parquet_offset += parquet_advance - 1; + // note that we found this field + found_fields.insert(requested_field.name()); + // push the child reorder on, currently no reordering for maps + reorder_indices.push(ReorderIndex::Index { + index, + is_null: false, + }); + } + _ => { + return Err(Error::unexpected_column_type(field.name())); + } + } + } + } _ => { if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) @@ -202,15 +247,16 @@ fn get_indices( } } } - //println!("found {found_fields}"); - //println!("found {found_fields}, requested {}. req schema: {:?}", requested_schema.fields.len(), requested_schema); if found_fields.len() != requested_schema.fields.len() { // some fields are missing, but they might be nullable, need to insert them into the reorder_indices for (requested_position, field) in requested_schema.fields().enumerate() { if !found_fields.contains(field.name()) { if field.nullable { - println!("Inserting missing and nullable field: {}", field.name()); - reorder_indices.push(ReorderIndex::Index{ index: requested_position, is_null: true}); + debug!("Inserting missing and nullable field: {}", field.name()); + reorder_indices.push(ReorderIndex::Index { + index: requested_position, + is_null: true, + }); } else { return Err(Error::Generic(format!( "Requested field not found in parquet schema, and field is not nullable: {}", @@ -220,10 +266,6 @@ fn get_indices( } } } - // require!( - // found_fields.len() == requested_schema.fields.len(), - // Error::generic("Didn't find all requested columns in parquet schema") - // ); Ok(( parquet_offset + fields.len() - start_parquet_offset, reorder_indices, @@ -245,7 +287,6 @@ pub(crate) fn get_requested_indices( requested_schema: &SchemaRef, parquet_schema: &ArrowSchemaRef, ) -> DeltaResult<(Vec, Vec)> { - //println!("Called with\n---\n{requested_schema:#?}\n---\n{parquet_schema:#?}"); let mut mask_indices = vec![]; let (_, reorder_indexes) = get_indices( 0, @@ -253,8 +294,6 @@ 
pub(crate) fn get_requested_indices( parquet_schema.fields(), &mut mask_indices, )?; - println!("parquet_schema: {parquet_schema:#?}"); - println!("mask {mask_indices:?}"); Ok((mask_indices, reorder_indexes)) } @@ -277,7 +316,7 @@ pub(crate) fn generate_mask( /// Check if an ordering is already ordered fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { - if requested_ordering.len() == 0 { + if requested_ordering.is_empty() { return true; } // we have >=1 element. check that the first element is ordered @@ -290,40 +329,68 @@ fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { .all(|ri| (ri[0].index() < ri[1].index()) && ri[1].is_ordered()) } +// we use this as a placeholder for an array and its associated field. We can fill in a Vec of None +// of this type and then set elements of the Vec to Some(FieldArrayOpt) for each column +type FieldArrayOpt = Option<(Arc, Arc)>; + /// Reorder a RecordBatch to match `requested_ordering`. For each non-zero value in /// `requested_ordering`, the column at that index will be added in order to returned batch -pub(crate) fn reorder_record_batch( - input_data: RecordBatch, +pub(crate) fn reorder_struct_array( + input_data: StructArray, requested_ordering: &[ReorderIndex], -) -> DeltaResult { +) -> DeltaResult { if is_ordered(requested_ordering) { // indices is already sorted, meaning we requested in the order that the columns were // stored in the parquet Ok(input_data) } else { - println!("ORDERING {requested_ordering:#?}, rows {}", input_data.num_rows()); // requested an order different from the parquet, reorder - let input_schema = input_data.schema(); - println!("data: {input_data:#?}"); - let mut final_fields_cols: Vec, Arc)>> = - std::iter::repeat_with(|| None) - .take(requested_ordering.len()) - .collect(); + debug!("Have requested reorder {requested_ordering:#?} on {input_data:?}"); + let (input_fields, input_cols, null_buffer) = input_data.into_parts(); + let mut final_fields_cols: Vec = 
std::iter::repeat_with(|| None) + .take(requested_ordering.len()) + .collect(); for (parquet_position, reorder_index) in requested_ordering.iter().enumerate() { // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data - println!("{parquet_position} in {input_schema:?}"); - final_fields_cols[reorder_index.index()] = Some(( - Arc::new(input_schema.field(parquet_position).clone()), - input_data.column(parquet_position).clone(), // cheap Arc clone - )); + match reorder_index { + ReorderIndex::Child { + index, + is_null: _is_null, + children: _children, + } => { + // TODO: This turns out to be *hard*. You cannot (easily) get a owned copy of + // the column without cloning, but we need that to be able to sort internally. + final_fields_cols[*index] = Some(( + input_fields[parquet_position].clone(), // cheap Arc clone + input_cols[parquet_position].clone(), // cheap Arc clone + )); + } + ReorderIndex::Index { + index, + is_null: _is_null, + } => { + final_fields_cols[*index] = Some(( + input_fields[parquet_position].clone(), // cheap Arc clone + input_cols[parquet_position].clone(), // cheap Arc clone + )); + } + } } - let field_iter = final_fields_cols - .iter() - .map(|fco| fco.as_ref().unwrap().0.clone()); - let schema = Arc::new(ArrowSchema::new(Fields::from_iter(field_iter))); - let reordered_columns = final_fields_cols.into_iter().map(|fco| fco.unwrap().1).collect(); - Ok(RecordBatch::try_new(schema, reordered_columns)?) + let fields = Fields::from_iter( + final_fields_cols + .iter() + .map(|fco| fco.as_ref().unwrap().0.clone()), // cheap Arc clone + ); + let reordered_columns = final_fields_cols + .into_iter() + .map(|fco| fco.unwrap().1) + .collect(); + Ok(StructArray::try_new( + fields, + reordered_columns, + null_buffer, + )?) 
} } diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index fc66d5126..d8240dd8b 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -12,7 +12,7 @@ use parquet::arrow::arrow_reader::{ use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; use super::file_stream::{FileOpenFuture, FileOpener, FileStream}; -use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_record_batch}; +use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; use crate::engine::default::executor::TaskExecutor; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; @@ -141,8 +141,9 @@ impl FileOpener for ParquetOpener { let stream = stream.map(move |rbr| { // re-order each batch if needed - rbr.map_err(Error::Parquet) - .and_then(|rb| reorder_record_batch(rb, &requested_ordering)) + rbr.map_err(Error::Parquet).and_then(|rb| { + reorder_struct_array(rb.into(), &requested_ordering).map(|sa| sa.into()) + }) }); Ok(stream.boxed()) })) @@ -204,8 +205,9 @@ impl FileOpener for PresignedUrlOpener { let stream = futures::stream::iter(reader); let stream = stream.map(move |rbr| { // re-order each batch if needed - rbr.map_err(Error::Arrow) - .and_then(|rb| reorder_record_batch(rb, &requested_ordering)) + rbr.map_err(Error::Arrow).and_then(|rb| { + reorder_struct_array(rb.into(), &requested_ordering).map(|sa| sa.into()) + }) }); Ok(stream.boxed()) })) diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 828dbe104..01a490bc5 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -5,7 +5,7 @@ use tracing::debug; use url::Url; use crate::engine::arrow_data::ArrowEngineData; -use crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_record_batch}; +use 
crate::engine::arrow_utils::{generate_mask, get_requested_indices, reorder_struct_array}; use crate::schema::SchemaRef; use crate::{DeltaResult, Error, Expression, FileDataReadResultIterator, FileMeta, ParquetHandler}; @@ -29,10 +29,8 @@ fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult bool { self.value_contains_null } + + /// Create a schema assuming the map is stored as a struct with the specified key and value field names + pub fn as_struct_schema(&self, key_name: String, val_name: String) -> Schema { + StructType::new(vec![ + StructField::new(key_name, self.key_type.clone(), false), + StructField::new(val_name, self.value_type.clone(), self.value_contains_null), + ]) + } } fn default_true() -> bool { From 1d72be8a31ddabc85ce2986852b7a12935550fc6 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 1 Jul 2024 15:24:29 -0700 Subject: [PATCH 11/54] comment fixup --- kernel/src/engine/arrow_utils.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 490516745..6669a9b9d 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -68,10 +68,9 @@ fn _count_cols(dt: &ArrowDataType) -> usize { } /// helper function, does the same as `get_requested_indices` but at an offset. used to recurse into -/// structs. this is called recursively to traverse into structs and lists. `parquet_offset` is how -/// many parquet fields exist before processing this potentially nested schema. `reorder_offset` is -/// how many fields we've found so far before processing at this nested schema. returns the number -/// of parquet fields and the number of requested fields processed +/// structs, lists, and maps. `parquet_offset` is how many parquet fields exist before processing +/// this potentially nested schema. 
returns the number of parquet fields in `fields` (regardless of +/// if they are selected or not) and reordering information for the requested fields. fn get_indices( start_parquet_offset: usize, requested_schema: &Schema, @@ -272,17 +271,13 @@ fn get_indices( )) } -/// Get the indices in `parquet_schema` of the specified columns in `requested_schema`. This -/// returns a tuples of (mask_indices: Vec, reorder_indices: +/// Get the indices in `parquet_schema` of the specified columns in `requested_schema`. This returns +/// a tuples of (mask_indices: Vec, reorder_indices: /// Vec). `mask_indices` is used for generating the mask for reading from the /// parquet file, and simply contains an entry for each index we wish to select from the parquet /// file set to the index of the requested column in the parquet. `reorder_indices` is used for -/// re-ordering and will be the same size as `requested_schema`. Each index in `reorder_indices` -/// represents a column that will be in the read parquet data at that index. The value stored in -/// `reorder_indices` is the position that the column should appear in the final output. For -/// example, if `reorder_indices` is `[2,0,1]`, then the re-ordering code should take the third -/// column in the raw-read parquet data, and move it to the first column in the final output, the -/// first column to the second, and the second to the third. +/// re-ordering. 
See the documentation for [`ReorderIndex`] to understand what each element in the +/// returned array means pub(crate) fn get_requested_indices( requested_schema: &SchemaRef, parquet_schema: &ArrowSchemaRef, From 9cac2c5f4e9a0c03a39664e611db41e784f3b054 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 1 Jul 2024 18:03:53 -0700 Subject: [PATCH 12/54] actually reorder children --- kernel/src/engine/arrow_utils.rs | 200 +++++++++++++++++++++---------- 1 file changed, 135 insertions(+), 65 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 6669a9b9d..994af96d2 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -7,9 +7,10 @@ use crate::{ DeltaResult, Error, }; -use arrow_array::{Array as ArrowArray, StructArray}; +use arrow_array::{new_null_array, Array as ArrowArray, StructArray}; use arrow_schema::{ - DataType as ArrowDataType, Field as ArrowField, Fields, SchemaRef as ArrowSchemaRef, + DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields, + SchemaRef as ArrowSchemaRef, }; use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; use tracing::debug; @@ -21,30 +22,36 @@ use tracing::debug; /// values in the associated `Vec` according to these same rules. #[derive(Debug, PartialEq)] pub(crate) enum ReorderIndex { + Child { + index: usize, + children: Vec, + }, Index { index: usize, - is_null: bool, }, - Child { + Null { index: usize, - is_null: bool, - children: Vec, + field: ArrowFieldRef, }, } impl ReorderIndex { fn index(&self) -> usize { match self { - ReorderIndex::Index { index, .. } => *index, ReorderIndex::Child { index, .. } => *index, + ReorderIndex::Index { index, .. } => *index, + ReorderIndex::Null { index, .. } => *index, } } - /// check if this indexing is ordered. an `Index` variant is ordered by definition + /// check if this indexing is ordered. an `Index` variant is ordered by definition. 
a `Null` + /// variant is not because if we have a `Null` variant we need to do work in + /// reorder_struct_array to insert the null col fn is_ordered(&self) -> bool { match self { - ReorderIndex::Index { .. } => true, ReorderIndex::Child { ref children, .. } => is_ordered(children), + ReorderIndex::Index { .. } => true, + ReorderIndex::Null { .. } => false, } } } @@ -105,11 +112,7 @@ fn get_indices( // note that we found this field found_fields.insert(requested_field.name()); // push the child reorder on - reorder_indices.push(ReorderIndex::Child { - index, - is_null: false, - children, - }); + reorder_indices.push(ReorderIndex::Child { index, children }); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -174,10 +177,7 @@ fn get_indices( } reorder_indices.push(children); } else { - reorder_indices.push(ReorderIndex::Index { - index, - is_null: false, - }); + reorder_indices.push(ReorderIndex::Index { index }); } } _ => { @@ -221,10 +221,7 @@ fn get_indices( // note that we found this field found_fields.insert(requested_field.name()); // push the child reorder on, currently no reordering for maps - reorder_indices.push(ReorderIndex::Index { - index, - is_null: false, - }); + reorder_indices.push(ReorderIndex::Index { index }); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -238,10 +235,7 @@ fn get_indices( { found_fields.insert(requested_field.name()); mask_indices.push(parquet_offset + parquet_index); - reorder_indices.push(ReorderIndex::Index { - index, - is_null: false, - }); + reorder_indices.push(ReorderIndex::Index { index }); } } } @@ -252,9 +246,9 @@ fn get_indices( if !found_fields.contains(field.name()) { if field.nullable { debug!("Inserting missing and nullable field: {}", field.name()); - reorder_indices.push(ReorderIndex::Index { + reorder_indices.push(ReorderIndex::Null { index: requested_position, - is_null: true, + field: Arc::new(field.try_into()?), }); } else { return Err(Error::Generic(format!( @@ 
-341,7 +335,8 @@ pub(crate) fn reorder_struct_array( } else { // requested an order different from the parquet, reorder debug!("Have requested reorder {requested_ordering:#?} on {input_data:?}"); - let (input_fields, input_cols, null_buffer) = input_data.into_parts(); + let num_rows = input_data.len(); + let (input_fields, mut input_cols, null_buffer) = input_data.into_parts(); let mut final_fields_cols: Vec = std::iter::repeat_with(|| None) .take(requested_ordering.len()) .collect(); @@ -349,27 +344,37 @@ pub(crate) fn reorder_struct_array( // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data match reorder_index { - ReorderIndex::Child { - index, - is_null: _is_null, - children: _children, - } => { - // TODO: This turns out to be *hard*. You cannot (easily) get a owned copy of - // the column without cloning, but we need that to be able to sort internally. + ReorderIndex::Child { index, children } => { + let mut placeholder: Arc = + Arc::new(StructArray::new_empty_fields(0, None)); + std::mem::swap(&mut input_cols[parquet_position], &mut placeholder); + // placeholder now holds our struct array that we want to reorder + let struct_array: StructArray = placeholder.into_data().into(); + let result_array = reorder_struct_array(struct_array, children)?; + // create the new field specifying the correct order for the struct + let new_field = Arc::new(ArrowField::new_struct( + input_fields[parquet_position].name(), + result_array.fields().clone(), + input_fields[parquet_position].is_nullable(), + )); + let mut sa: Arc = Arc::new(result_array); + std::mem::swap(&mut input_cols[parquet_position], &mut sa); final_fields_cols[*index] = Some(( - input_fields[parquet_position].clone(), // cheap Arc clone - input_cols[parquet_position].clone(), // cheap Arc clone + new_field, + input_cols[parquet_position].clone(), // cheap Arc clone )); } - ReorderIndex::Index { - index, - is_null: 
_is_null, - } => { + ReorderIndex::Index { index } => { final_fields_cols[*index] = Some(( input_fields[parquet_position].clone(), // cheap Arc clone input_cols[parquet_position].clone(), // cheap Arc clone )); } + ReorderIndex::Null { index, field } => { + let null_arry = Arc::new(new_null_array(field.data_type(), num_rows)); + let field = field.clone(); // cheap Arc clone + final_fields_cols[*index] = Some((field, null_arry)); + } } } let fields = Fields::from_iter( @@ -393,30 +398,20 @@ pub(crate) fn reorder_struct_array( mod tests { use std::sync::Arc; + use arrow::array::AsArray; + use arrow_array::{ArrayRef as ArrowArrayRef, BooleanArray, Int32Array, StructArray}; use arrow_schema::{ - DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, + DataType as ArrowDataType, Field as ArrowField, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; use crate::schema::{ArrayType, DataType, StructField, StructType}; - use super::{get_requested_indices, ReorderIndex}; + use super::{get_requested_indices, reorder_struct_array, ReorderIndex}; macro_rules! rii { ($index: expr) => {{ - ReorderIndex::Index { - index: $index, - is_null: false, - } - }}; - } - - macro_rules! 
rii_null { - ($index: expr) => {{ - ReorderIndex::Index { - index: $index, - is_null: true, - } + ReorderIndex::Index { index: $index } }}; } @@ -492,7 +487,14 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1]; - let expect_reorder = vec![rii!(0), rii!(2), rii_null!(1)]; + let expect_reorder = vec![ + rii!(0), + rii!(2), + ReorderIndex::Null { + index: 1, + field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), + }, + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -519,7 +521,6 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, - is_null: false, children: vec![rii!(0), rii!(1)], }, rii!(2), @@ -550,7 +551,6 @@ mod tests { rii!(2), ReorderIndex::Child { index: 0, - is_null: false, children: vec![rii!(1), rii!(0)], }, rii!(1), @@ -578,7 +578,6 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, - is_null: false, children: vec![rii!(0)], }, rii!(2), @@ -659,7 +658,6 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, - is_null: false, children: vec![rii!(0), rii!(1)], }, rii!(2), @@ -709,7 +707,6 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, - is_null: false, children: vec![rii!(0)], }, rii!(2), @@ -763,7 +760,6 @@ mod tests { rii!(0), ReorderIndex::Child { index: 1, - is_null: false, children: vec![rii!(1), rii!(0)], }, rii!(2), @@ -819,7 +815,6 @@ mod tests { rii!(2), ReorderIndex::Child { index: 1, - is_null: false, children: vec![rii!(0), rii!(1)], }, rii!(0), @@ -827,4 +822,79 @@ mod tests { assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } + + fn make_struct_array() -> StructArray { + let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); + let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); + StructArray::from(vec![ + ( + Arc::new(ArrowField::new("b", ArrowDataType::Boolean, false)), + boolean.clone() as ArrowArrayRef, + ), + 
( + Arc::new(ArrowField::new("c", ArrowDataType::Int32, false)), + int.clone() as ArrowArrayRef, + ), + ]) + } + + #[test] + fn simple_reorder_struct() { + let arry = make_struct_array(); + let reorder = vec![rii!(1), rii!(0)]; + let ordered = reorder_struct_array(arry, &reorder).unwrap(); + assert_eq!(ordered.column_names(), vec!["c", "b"]); + } + + #[test] + fn nested_reorder_struct() { + let arry1 = Arc::new(make_struct_array()); + let arry2 = Arc::new(make_struct_array()); + let fields: Fields = vec![ + Arc::new(ArrowField::new("b", ArrowDataType::Boolean, false)), + Arc::new(ArrowField::new("c", ArrowDataType::Int32, false)), + ] + .into(); + let nested = StructArray::from(vec![ + ( + Arc::new(ArrowField::new( + "struct1", + ArrowDataType::Struct(fields.clone()), + false, + )), + arry1 as ArrowArrayRef, + ), + ( + Arc::new(ArrowField::new( + "struct2", + ArrowDataType::Struct(fields), + false, + )), + arry2 as ArrowArrayRef, + ), + ]); + let reorder = vec![ + ReorderIndex::Child { + index: 1, + children: vec![rii!(1), rii!(0)], + }, + ReorderIndex::Child { + index: 0, + children: vec![ + rii!(0), + rii!(1), + ReorderIndex::Null { + index: 2, + field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), + }, + ], + }, + ]; + let ordered = reorder_struct_array(nested, &reorder).unwrap(); + assert_eq!(ordered.column_names(), vec!["struct2", "struct1"]); + let ordered_s2 = ordered.column(0).as_struct(); + assert_eq!(ordered_s2.column_names(), vec!["b", "c", "s"]); + let ordered_s1 = ordered.column(1).as_struct(); + assert_eq!(ordered_s1.column_names(), vec!["c", "b"]); + } } From 83836cbcdedc87134995b963282a1d0a146a00dc Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 1 Jul 2024 18:11:00 -0700 Subject: [PATCH 13/54] no need to swap back --- kernel/src/engine/arrow_utils.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 994af96d2..62fee72e5 100644 --- 
a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -357,12 +357,7 @@ pub(crate) fn reorder_struct_array( result_array.fields().clone(), input_fields[parquet_position].is_nullable(), )); - let mut sa: Arc = Arc::new(result_array); - std::mem::swap(&mut input_cols[parquet_position], &mut sa); - final_fields_cols[*index] = Some(( - new_field, - input_cols[parquet_position].clone(), // cheap Arc clone - )); + final_fields_cols[*index] = Some((new_field, Arc::new(result_array))); } ReorderIndex::Index { index } => { final_fields_cols[*index] = Some(( From 2c48f0cfe0cf7f08f06ff3193563c0afa717985a Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 2 Jul 2024 16:20:26 -0700 Subject: [PATCH 14/54] cleaner Vec initialization --- kernel/src/engine/arrow_utils.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 62fee72e5..02be29dfe 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -337,9 +337,7 @@ pub(crate) fn reorder_struct_array( debug!("Have requested reorder {requested_ordering:#?} on {input_data:?}"); let num_rows = input_data.len(); let (input_fields, mut input_cols, null_buffer) = input_data.into_parts(); - let mut final_fields_cols: Vec = std::iter::repeat_with(|| None) - .take(requested_ordering.len()) - .collect(); + let mut final_fields_cols: Vec = vec![None; requested_ordering.len()]; for (parquet_position, reorder_index) in requested_ordering.iter().enumerate() { // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data From b695cef3622eb461ed1c74941acd092001a49a74 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 8 Jul 2024 10:36:28 -0700 Subject: [PATCH 15/54] DON'T PANIC --- kernel/src/engine/arrow_utils.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git 
a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 02be29dfe..7d0d77682 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -336,8 +336,9 @@ pub(crate) fn reorder_struct_array( // requested an order different from the parquet, reorder debug!("Have requested reorder {requested_ordering:#?} on {input_data:?}"); let num_rows = input_data.len(); + let num_cols = requested_ordering.len(); let (input_fields, mut input_cols, null_buffer) = input_data.into_parts(); - let mut final_fields_cols: Vec = vec![None; requested_ordering.len()]; + let mut final_fields_cols: Vec = vec![None; num_cols]; for (parquet_position, reorder_index) in requested_ordering.iter().enumerate() { // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data @@ -370,17 +371,17 @@ pub(crate) fn reorder_struct_array( } } } - let fields = Fields::from_iter( - final_fields_cols - .iter() - .map(|fco| fco.as_ref().unwrap().0.clone()), // cheap Arc clone - ); - let reordered_columns = final_fields_cols - .into_iter() - .map(|fco| fco.unwrap().1) - .collect(); + let mut field_vec = Vec::with_capacity(num_cols); + let mut reordered_columns = Vec::with_capacity(num_cols); + for field_array_opt in final_fields_cols.into_iter() { + let (field, array) = field_array_opt.ok_or_else(|| Error::generic( + "Found a None in final_fields_cols. This is a kernel bug, please report." + ))?; + field_vec.push(field); + reordered_columns.push(array); + } Ok(StructArray::try_new( - fields, + field_vec.into(), reordered_columns, null_buffer, )?) 
From baf80e721e9fe52f8edca2ddadcfc5e5c82ee982 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 8 Jul 2024 10:40:19 -0700 Subject: [PATCH 16/54] just use as_struct --- kernel/src/engine/arrow_utils.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 7d0d77682..19b022853 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -7,7 +7,7 @@ use crate::{ DeltaResult, Error, }; -use arrow_array::{new_null_array, Array as ArrowArray, StructArray}; +use arrow_array::{cast::AsArray, new_null_array, Array as ArrowArray, StructArray}; use arrow_schema::{ DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields, SchemaRef as ArrowSchemaRef, @@ -337,18 +337,14 @@ pub(crate) fn reorder_struct_array( debug!("Have requested reorder {requested_ordering:#?} on {input_data:?}"); let num_rows = input_data.len(); let num_cols = requested_ordering.len(); - let (input_fields, mut input_cols, null_buffer) = input_data.into_parts(); + let (input_fields, input_cols, null_buffer) = input_data.into_parts(); let mut final_fields_cols: Vec = vec![None; num_cols]; for (parquet_position, reorder_index) in requested_ordering.iter().enumerate() { // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data match reorder_index { ReorderIndex::Child { index, children } => { - let mut placeholder: Arc = - Arc::new(StructArray::new_empty_fields(0, None)); - std::mem::swap(&mut input_cols[parquet_position], &mut placeholder); - // placeholder now holds our struct array that we want to reorder - let struct_array: StructArray = placeholder.into_data().into(); + let struct_array = input_cols[parquet_position].as_struct().clone(); let result_array = reorder_struct_array(struct_array, children)?; // create the new field specifying the correct order for 
the struct let new_field = Arc::new(ArrowField::new_struct( @@ -374,9 +370,11 @@ pub(crate) fn reorder_struct_array( let mut field_vec = Vec::with_capacity(num_cols); let mut reordered_columns = Vec::with_capacity(num_cols); for field_array_opt in final_fields_cols.into_iter() { - let (field, array) = field_array_opt.ok_or_else(|| Error::generic( - "Found a None in final_fields_cols. This is a kernel bug, please report." - ))?; + let (field, array) = field_array_opt.ok_or_else(|| { + Error::generic( + "Found a None in final_fields_cols. This is a kernel bug, please report.", + ) + })?; field_vec.push(field); reordered_columns.push(array); } From 49e258117998186f0d8bfc7aaf910256edf2f3ee Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 8 Jul 2024 17:08:46 -0700 Subject: [PATCH 17/54] handle list of struct sorting --- kernel/src/engine/arrow_utils.rs | 139 ++++++++++++++++++++++++++++--- 1 file changed, 126 insertions(+), 13 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 19b022853..36484f222 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -7,7 +7,10 @@ use crate::{ DeltaResult, Error, }; -use arrow_array::{cast::AsArray, new_null_array, Array as ArrowArray, StructArray}; +use arrow_array::{ + cast::AsArray, new_null_array, Array as ArrowArray, GenericListArray, OffsetSizeTrait, + StructArray, +}; use arrow_schema::{ DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields, SchemaRef as ArrowSchemaRef, @@ -126,7 +129,9 @@ fn get_indices( parquet_offset += count_cols(field) - 1; } } - ArrowDataType::List(list_field) | ArrowDataType::ListView(list_field) => { + ArrowDataType::List(list_field) + | ArrowDataType::LargeList(list_field) + | ArrowDataType::ListView(list_field) => { if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { @@ -344,15 +349,41 @@ pub(crate) fn reorder_struct_array( // requested_ordering 
tells us where it is in the parquet data match reorder_index { ReorderIndex::Child { index, children } => { - let struct_array = input_cols[parquet_position].as_struct().clone(); - let result_array = reorder_struct_array(struct_array, children)?; - // create the new field specifying the correct order for the struct - let new_field = Arc::new(ArrowField::new_struct( - input_fields[parquet_position].name(), - result_array.fields().clone(), - input_fields[parquet_position].is_nullable(), - )); - final_fields_cols[*index] = Some((new_field, Arc::new(result_array))); + match input_cols[parquet_position].data_type() { + ArrowDataType::Struct(_) => { + let struct_array = input_cols[parquet_position].as_struct().clone(); + let result_array = reorder_struct_array(struct_array, children)?; + // create the new field specifying the correct order for the struct + let new_field = Arc::new(ArrowField::new_struct( + input_fields[parquet_position].name(), + result_array.fields().clone(), + input_fields[parquet_position].is_nullable(), + )); + final_fields_cols[*index] = Some((new_field, Arc::new(result_array))); + } + ArrowDataType::List(_) => { + let list_array = input_cols[parquet_position].as_list::().clone(); + final_fields_cols[*index] = reorder_list( + list_array, + input_fields[parquet_position].name(), + children, + )?; + } + ArrowDataType::LargeList(_) => { + let list_array = input_cols[parquet_position].as_list::().clone(); + final_fields_cols[*index] = reorder_list( + list_array, + input_fields[parquet_position].name(), + children, + )?; + } + // TODO: MAP + _ => { + return Err(Error::generic( + "Child reorder can only apply to struct/list/map. 
This is a kernel bug, please report" + )); + } + } } ReorderIndex::Index { index } => { final_fields_cols[*index] = Some(( @@ -386,12 +417,46 @@ pub(crate) fn reorder_struct_array( } } +fn reorder_list( + list_array: GenericListArray, + input_field_name: &str, + children: &[ReorderIndex], +) -> DeltaResult { + let (list_field, offset_buffer, maybe_sa, null_buf) = list_array.into_parts(); + if let Some(struct_array) = maybe_sa.as_struct_opt() { + let struct_array = struct_array.clone(); + let result_array = Arc::new(reorder_struct_array(struct_array, children)?); + let new_list_field = Arc::new(ArrowField::new_struct( + list_field.name(), + result_array.fields().clone(), + result_array.is_nullable(), + )); + let new_field = Arc::new(ArrowField::new_list( + input_field_name, + new_list_field.clone(), + list_field.is_nullable(), + )); + let list = + GenericListArray::try_new(new_list_field, offset_buffer, result_array, null_buf)?; + Ok(Some((new_field, Arc::new(list)))) + } else { + Err(Error::generic( + "Child reorder of list should have had struct child. 
This is a kernel bug, please report" + )) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; - use arrow::array::AsArray; - use arrow_array::{ArrayRef as ArrowArrayRef, BooleanArray, Int32Array, StructArray}; + use arrow::{ + array::AsArray, + buffer::{OffsetBuffer, ScalarBuffer}, + }; + use arrow_array::{ + Array, ArrayRef as ArrowArrayRef, BooleanArray, GenericListArray, Int32Array, StructArray, + }; use arrow_schema::{ DataType as ArrowDataType, Field as ArrowField, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, @@ -889,4 +954,52 @@ mod tests { let ordered_s1 = ordered.column(1).as_struct(); assert_eq!(ordered_s1.column_names(), vec!["c", "b"]); } + + #[test] + fn reorder_list_of_struct() { + let boolean = Arc::new(BooleanArray::from(vec![ + false, false, true, true, false, true, + ])); + let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31, 0, 3])); + let list_sa = StructArray::from(vec![ + ( + Arc::new(ArrowField::new("b", ArrowDataType::Boolean, false)), + boolean.clone() as ArrowArrayRef, + ), + ( + Arc::new(ArrowField::new("c", ArrowDataType::Int32, false)), + int.clone() as ArrowArrayRef, + ), + ]); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6])); + let list_field = ArrowField::new("item", list_sa.data_type().clone(), false); + let list = Arc::new(GenericListArray::new( + Arc::new(list_field), + offsets, + Arc::new(list_sa), + None, + )); + let fields: Fields = vec![ + Arc::new(ArrowField::new("b", ArrowDataType::Boolean, false)), + Arc::new(ArrowField::new("c", ArrowDataType::Int32, false)), + ] + .into(); + let list_dt = Arc::new(ArrowField::new( + "list", + ArrowDataType::new_list(ArrowDataType::Struct(fields), false), + false, + )); + let struct_array = StructArray::from(vec![(list_dt, list as ArrowArrayRef)]); + let reorder = vec![ReorderIndex::Child { + index: 0, + children: vec![rii!(1), rii!(0)], + }]; + let ordered = reorder_struct_array(struct_array, &reorder).unwrap(); + let ordered_list_col = 
ordered.column(0).as_list::(); + for i in 0..ordered_list_col.len() { + let array_item = ordered_list_col.value(i); + let struct_item = array_item.as_struct(); + assert_eq!(struct_item.column_names(), vec!["c", "b"]); + } + } } From 7aa9870dc50a891bd500715422d8c337e1e7b991 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 8 Jul 2024 17:25:36 -0700 Subject: [PATCH 18/54] handle deeper list nesting --- kernel/src/engine/arrow_utils.rs | 50 +++++++++++++------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 36484f222..42bd7d279 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -47,6 +47,14 @@ impl ReorderIndex { } } + fn set_index(&mut self, target_index: usize) { + match self { + ReorderIndex::Child { ref mut index, .. } => *index = target_index, + ReorderIndex::Index { ref mut index } => *index = target_index, + ReorderIndex::Null { ref mut index, .. } => *index = target_index, + } + } + /// check if this indexing is ordered. an `Index` variant is ordered by definition. a `Null` /// variant is not because if we have a `Null` variant we need to do work in /// reorder_struct_array to insert the null col @@ -153,37 +161,19 @@ fn get_indices( // see comment above in struct match arm parquet_offset += parquet_advance - 1; found_fields.insert(requested_field.name()); - // we have to recurse to find the type, but for reordering a list we - // only need a child reordering if the inner type is a struct - if let ArrowDataType::Struct(_) = list_field.data_type() { - if children.len() != 1 { - return Err( - Error::generic( - "List call should not have generated more than one reorder index" - ) - ); - } - // safety, checked that we have 1 element - let mut children = children.into_iter().next().unwrap(); - // the index is wrong though, as it's the index from the inner - // schema. 
Adjust it to be our index - if let ReorderIndex::Child { - index: ref mut child_index, - .. - } = children - { - *child_index = index; - } else { - return Err( - Error::generic( - "List call should have returned a ReorderIndex::Child variant" - ) - ); - } - reorder_indices.push(children); - } else { - reorder_indices.push(ReorderIndex::Index { index }); + if children.len() != 1 { + return Err( + Error::generic( + "List call should not have generated more than one reorder index" + ) + ); } + // safety, checked that we have 1 element + let mut children = children.into_iter().next().unwrap(); + // the index is wrong, as it's the index from the inner schema. Adjust + // it to be our index + children.set_index(index); + reorder_indices.push(children); } _ => { return Err(Error::unexpected_column_type(list_field.name())); From f76c44e1df4b04a038758fd98874f55afea01aad Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 9 Jul 2024 15:51:03 -0700 Subject: [PATCH 19/54] Fix comment --- kernel/src/engine/arrow_utils.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 42bd7d279..1ef4ac032 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -281,9 +281,8 @@ pub(crate) fn get_requested_indices( Ok((mask_indices, reorder_indexes)) } -/// Create a mask that will only select the specified indices from the parquet. Currently we only -/// handle "root" level columns, and hence use `ProjectionMask::roots`, but will support leaf -/// selection in the future. See issues #86 and #96 as well. +/// Create a mask that will only select the specified indices from the parquet. 
`indices` can be +/// computed from a [`Schema`] using [`get_requested_indices`] pub(crate) fn generate_mask( _requested_schema: &SchemaRef, _parquet_schema: &ArrowSchemaRef, From 4cd8aaf08b07f2cabf04d6916ca70f92480c35c1 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 10:46:54 -0700 Subject: [PATCH 20/54] use Into::into --- kernel/src/engine/default/parquet.rs | 6 +++--- kernel/src/engine/sync/parquet.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index d8240dd8b..46bff22cb 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -142,7 +142,7 @@ impl FileOpener for ParquetOpener { let stream = stream.map(move |rbr| { // re-order each batch if needed rbr.map_err(Error::Parquet).and_then(|rb| { - reorder_struct_array(rb.into(), &requested_ordering).map(|sa| sa.into()) + reorder_struct_array(rb.into(), &requested_ordering).map(Into::into) }) }); Ok(stream.boxed()) @@ -206,7 +206,7 @@ impl FileOpener for PresignedUrlOpener { let stream = stream.map(move |rbr| { // re-order each batch if needed rbr.map_err(Error::Arrow).and_then(|rb| { - reorder_struct_array(rb.into(), &requested_ordering).map(|sa| sa.into()) + reorder_struct_array(rb.into(), &requested_ordering).map(Into::into) }) }); Ok(stream.boxed()) @@ -234,7 +234,7 @@ mod tests { ) -> DeltaResult { engine_data .and_then(ArrowEngineData::try_from_engine_data) - .map(|sd| sd.into()) + .map(Into::into) } #[tokio::test] diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index 01a490bc5..cd25a926a 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -29,7 +29,7 @@ fn try_create_from_parquet(schema: SchemaRef, location: Url) -> DeltaResult Date: Wed, 10 Jul 2024 10:50:30 -0700 Subject: [PATCH 21/54] make rii! 
a const fn --- kernel/src/engine/arrow_utils.rs | 68 ++++++++++++++++---------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 1ef4ac032..0b99979ff 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -455,10 +455,8 @@ mod tests { use super::{get_requested_indices, reorder_struct_array, ReorderIndex}; - macro_rules! rii { - ($index: expr) => {{ - ReorderIndex::Index { index: $index } - }}; + const fn rii(index: usize) -> ReorderIndex { + ReorderIndex::Index { index } } fn nested_arrow_schema() -> ArrowSchemaRef { @@ -494,7 +492,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![rii!(0), rii!(1), rii!(2)]; + let expect_reorder = vec![rii(0), rii(1), rii(2)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -514,7 +512,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![rii!(2), rii!(0), rii!(1)]; + let expect_reorder = vec![rii(2), rii(0), rii(1)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -534,8 +532,8 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1]; let expect_reorder = vec![ - rii!(0), - rii!(2), + rii(0), + rii(2), ReorderIndex::Null { index: 1, field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), @@ -564,12 +562,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - rii!(0), + rii(0), ReorderIndex::Child { index: 1, - children: vec![rii!(0), rii!(1)], + children: vec![rii(0), rii(1)], }, - rii!(2), + rii(2), ]; assert_eq!(mask_indices, 
expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -594,12 +592,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - rii!(2), + rii(2), ReorderIndex::Child { index: 0, - children: vec![rii!(1), rii!(0)], + children: vec![rii(1), rii(0)], }, - rii!(1), + rii(1), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -621,12 +619,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - rii!(0), + rii(0), ReorderIndex::Child { index: 1, - children: vec![rii!(0)], + children: vec![rii(0)], }, - rii!(2), + rii(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -655,7 +653,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![rii!(0), rii!(1), rii!(2)]; + let expect_reorder = vec![rii(0), rii(1), rii(2)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -701,12 +699,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - rii!(0), + rii(0), ReorderIndex::Child { index: 1, - children: vec![rii!(0), rii!(1)], + children: vec![rii(0), rii(1)], }, - rii!(2), + rii(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -750,12 +748,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - rii!(0), + rii(0), ReorderIndex::Child { index: 1, - children: vec![rii!(0)], + children: vec![rii(0)], }, - rii!(2), + rii(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -803,12 +801,12 @@ mod tests { 
get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 2, 3, 4]; let expect_reorder = vec![ - rii!(0), + rii(0), ReorderIndex::Child { index: 1, - children: vec![rii!(1), rii!(0)], + children: vec![rii(1), rii(0)], }, - rii!(2), + rii(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -858,12 +856,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![2, 3, 4, 5]; let expect_reorder = vec![ - rii!(2), + rii(2), ReorderIndex::Child { index: 1, - children: vec![rii!(0), rii!(1)], + children: vec![rii(0), rii(1)], }, - rii!(0), + rii(0), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -887,7 +885,7 @@ mod tests { #[test] fn simple_reorder_struct() { let arry = make_struct_array(); - let reorder = vec![rii!(1), rii!(0)]; + let reorder = vec![rii(1), rii(0)]; let ordered = reorder_struct_array(arry, &reorder).unwrap(); assert_eq!(ordered.column_names(), vec!["c", "b"]); } @@ -922,13 +920,13 @@ mod tests { let reorder = vec![ ReorderIndex::Child { index: 1, - children: vec![rii!(1), rii!(0)], + children: vec![rii(1), rii(0)], }, ReorderIndex::Child { index: 0, children: vec![ - rii!(0), - rii!(1), + rii(0), + rii(1), ReorderIndex::Null { index: 2, field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), @@ -981,7 +979,7 @@ mod tests { let struct_array = StructArray::from(vec![(list_dt, list as ArrowArrayRef)]); let reorder = vec![ReorderIndex::Child { index: 0, - children: vec![rii!(1), rii!(0)], + children: vec![rii(1), rii(0)], }]; let ordered = reorder_struct_array(struct_array, &reorder).unwrap(); let ordered_list_col = ordered.column(0).as_list::(); From 3c55f88a0409bbd1b778a16fb4fea936292c7069 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 10:51:27 -0700 Subject: [PATCH 22/54] arc earlier --- kernel/src/engine/arrow_utils.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 0b99979ff..66a1a73e5 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -426,8 +426,8 @@ fn reorder_list( list_field.is_nullable(), )); let list = - GenericListArray::try_new(new_list_field, offset_buffer, result_array, null_buf)?; - Ok(Some((new_field, Arc::new(list)))) + Arc::new(GenericListArray::try_new(new_list_field, offset_buffer, result_array, null_buf)?); + Ok(Some((new_field, list))) } else { Err(Error::generic( "Child reorder of list should have had struct child. This is a kernel bug, please report" From 07a750c40b5478be2e5377847c4fc039ffe76a13 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 12:14:41 -0700 Subject: [PATCH 23/54] move ensure_data_types, and use in computation of incicies --- kernel/src/engine/arrow_expression.rs | 89 +-------------- kernel/src/engine/arrow_utils.rs | 149 ++++++++++++++++++++++++-- 2 files changed, 142 insertions(+), 96 deletions(-) diff --git a/kernel/src/engine/arrow_expression.rs b/kernel/src/engine/arrow_expression.rs index bd11fdb3c..61d2a3234 100644 --- a/kernel/src/engine/arrow_expression.rs +++ b/kernel/src/engine/arrow_expression.rs @@ -17,10 +17,10 @@ use itertools::Itertools; use super::arrow_conversion::LIST_ARRAY_ROOT; use crate::engine::arrow_data::ArrowEngineData; +use crate::engine::arrow_utils::ensure_data_types; use crate::error::{DeltaResult, Error}; use crate::expressions::{BinaryOperator, Expression, Scalar, UnaryOperator, VariadicOperator}; use crate::schema::{DataType, PrimitiveType, SchemaRef}; -use crate::utils::require; use crate::{EngineData, ExpressionEvaluator, ExpressionHandler}; // TODO leverage scalars / Datum @@ -161,93 +161,6 @@ fn column_as_struct<'a>( .ok_or(ArrowError::SchemaError(format!("{} is not a struct", name))) } -fn make_arrow_error(s: String) -> Error { - Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(s)) -} - 
-/// Ensure a kernel data type matches an arrow data type. This only ensures that the actual "type" -/// is the same, but does so recursively into structs, and ensures lists and maps have the correct -/// associated types as well. This returns an `Ok(())` if the types are compatible, or an error if -/// the types do not match. If there is a `struct` type included, we only ensure that the named -/// fields that the kernel is asking for exist, and that for those fields the types -/// match. Un-selected fields are ignored. -fn ensure_data_types(kernel_type: &DataType, arrow_type: &ArrowDataType) -> DeltaResult<()> { - match (kernel_type, arrow_type) { - (DataType::Primitive(_), _) if arrow_type.is_primitive() => Ok(()), - (DataType::Primitive(PrimitiveType::Boolean), ArrowDataType::Boolean) - | (DataType::Primitive(PrimitiveType::String), ArrowDataType::Utf8) - | (DataType::Primitive(PrimitiveType::Binary), ArrowDataType::Binary) => { - // strings, bools, and binary aren't primitive in arrow - Ok(()) - } - ( - DataType::Primitive(PrimitiveType::Decimal(kernel_prec, kernel_scale)), - ArrowDataType::Decimal128(arrow_prec, arrow_scale), - ) if arrow_prec == kernel_prec && *arrow_scale == *kernel_scale as i8 => { - // decimal isn't primitive in arrow. 
cast above is okay as we limit range - Ok(()) - } - (DataType::Array(inner_type), ArrowDataType::List(arrow_list_type)) => { - let kernel_array_type = &inner_type.element_type; - let arrow_list_type = arrow_list_type.data_type(); - ensure_data_types(kernel_array_type, arrow_list_type) - } - (DataType::Map(kernel_map_type), ArrowDataType::Map(arrow_map_type, _)) => { - if let ArrowDataType::Struct(fields) = arrow_map_type.data_type() { - let mut fiter = fields.iter(); - if let Some(key_type) = fiter.next() { - ensure_data_types(&kernel_map_type.key_type, key_type.data_type())?; - } else { - return Err(make_arrow_error( - "Arrow map struct didn't have a key type".to_string(), - )); - } - if let Some(value_type) = fiter.next() { - ensure_data_types(&kernel_map_type.value_type, value_type.data_type())?; - } else { - return Err(make_arrow_error( - "Arrow map struct didn't have a value type".to_string(), - )); - } - Ok(()) - } else { - Err(make_arrow_error( - "Arrow map type wasn't a struct.".to_string(), - )) - } - } - (DataType::Struct(kernel_fields), ArrowDataType::Struct(arrow_fields)) => { - // build a list of kernel fields that matches the order of the arrow fields - let mapped_fields = arrow_fields - .iter() - .flat_map(|f| kernel_fields.fields.get(f.name())); - - // keep track of how many fields we matched up - let mut found_fields = 0; - // ensure that for the fields that we found, the types match - for (kernel_field, arrow_field) in mapped_fields.zip(arrow_fields) { - ensure_data_types(&kernel_field.data_type, arrow_field.data_type())?; - found_fields += 1; - } - - // require that we found the number of fields that we requested. - require!(kernel_fields.fields.len() == found_fields, { - let kernel_field_names = kernel_fields.fields.keys().join(", "); - let arrow_field_names = arrow_fields.iter().map(|f| f.name()).join(", "); - make_arrow_error(format!( - "Missing Struct fields. 
Requested: {}, found: {}", - kernel_field_names, arrow_field_names, - )) - }); - Ok(()) - } - _ => Err(make_arrow_error(format!( - "Incorrect datatype. Expected {}, got {}", - kernel_type, arrow_type - ))), - } -} - fn evaluate_expression( expression: &Expression, batch: &RecordBatch, diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 66a1a73e5..ae0959609 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -3,7 +3,8 @@ use std::{collections::HashSet, sync::Arc}; use crate::{ - schema::{DataType, Schema, SchemaRef, StructField, StructType}, + schema::{DataType, PrimitiveType, Schema, SchemaRef, StructField, StructType}, + utils::require, DeltaResult, Error, }; @@ -15,9 +16,110 @@ use arrow_schema::{ DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields, SchemaRef as ArrowSchemaRef, }; +use itertools::Itertools; use parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; use tracing::debug; +fn make_arrow_error(s: String) -> Error { + Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(s)) +} + +/// Ensure a kernel data type matches an arrow data type. This only ensures that the actual "type" +/// is the same, but does so recursively into structs, and ensures lists and maps have the correct +/// associated types as well. This returns an `Ok(())` if the types are compatible, or an error if +/// the types do not match. If there is a `struct` type included, we only ensure that the named +/// fields that the kernel is asking for exist, and that for those fields the types +/// match. Un-selected fields are ignored. 
+pub(crate) fn ensure_data_types( + kernel_type: &DataType, + arrow_type: &ArrowDataType, +) -> DeltaResult<()> { + match (kernel_type, arrow_type) { + (DataType::Primitive(_), _) if arrow_type.is_primitive() => { + let converted_type: ArrowDataType = kernel_type.try_into()?; + if &converted_type == arrow_type { + Ok(()) + } else { + Err(make_arrow_error(format!( + "Incorrect datatype. Expected {}, got {}", + kernel_type, arrow_type + ))) + } + } + (DataType::Primitive(PrimitiveType::Boolean), ArrowDataType::Boolean) + | (DataType::Primitive(PrimitiveType::String), ArrowDataType::Utf8) + | (DataType::Primitive(PrimitiveType::Binary), ArrowDataType::Binary) => { + // strings, bools, and binary aren't primitive in arrow + Ok(()) + } + ( + DataType::Primitive(PrimitiveType::Decimal(kernel_prec, kernel_scale)), + ArrowDataType::Decimal128(arrow_prec, arrow_scale), + ) if arrow_prec == kernel_prec && *arrow_scale == *kernel_scale as i8 => { + // decimal isn't primitive in arrow. cast above is okay as we limit range + Ok(()) + } + (DataType::Array(inner_type), ArrowDataType::List(arrow_list_type)) => { + let kernel_array_type = &inner_type.element_type; + let arrow_list_type = arrow_list_type.data_type(); + ensure_data_types(kernel_array_type, arrow_list_type) + } + (DataType::Map(kernel_map_type), ArrowDataType::Map(arrow_map_type, _)) => { + if let ArrowDataType::Struct(fields) = arrow_map_type.data_type() { + let mut fiter = fields.iter(); + if let Some(key_type) = fiter.next() { + ensure_data_types(&kernel_map_type.key_type, key_type.data_type())?; + } else { + return Err(make_arrow_error( + "Arrow map struct didn't have a key type".to_string(), + )); + } + if let Some(value_type) = fiter.next() { + ensure_data_types(&kernel_map_type.value_type, value_type.data_type())?; + } else { + return Err(make_arrow_error( + "Arrow map struct didn't have a value type".to_string(), + )); + } + Ok(()) + } else { + Err(make_arrow_error( + "Arrow map type wasn't a 
struct.".to_string(), + )) + } + } + (DataType::Struct(kernel_fields), ArrowDataType::Struct(arrow_fields)) => { + // build a list of kernel fields that matches the order of the arrow fields + let mapped_fields = arrow_fields + .iter() + .flat_map(|f| kernel_fields.fields.get(f.name())); + + // keep track of how many fields we matched up + let mut found_fields = 0; + // ensure that for the fields that we found, the types match + for (kernel_field, arrow_field) in mapped_fields.zip(arrow_fields) { + ensure_data_types(&kernel_field.data_type, arrow_field.data_type())?; + found_fields += 1; + } + + // require that we found the number of fields that we requested. + require!(kernel_fields.fields.len() == found_fields, { + let kernel_field_names = kernel_fields.fields.keys().join(", "); + let arrow_field_names = arrow_fields.iter().map(|f| f.name()).join(", "); + make_arrow_error(format!( + "Missing Struct fields. Requested: {}, found: {}", + kernel_field_names, arrow_field_names, + )) + }); + Ok(()) + } + _ => Err(make_arrow_error(format!( + "Incorrect datatype. Expected {}, got {}", + kernel_type, arrow_type + ))), + } +} + /// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. Each element's index /// represents a column that will be in the read parquet data at that level and index. The `index()` /// of the element is the position that the column should appear in the final output. 
If it is a @@ -74,8 +176,8 @@ fn count_cols(field: &ArrowField) -> usize { fn _count_cols(dt: &ArrowDataType) -> usize { match dt { - ArrowDataType::Struct(fields) => fields.iter().fold(0, |acc, f| acc + count_cols(f)), - ArrowDataType::Union(fields, _) => fields.iter().fold(0, |acc, (_, f)| acc + count_cols(f)), + ArrowDataType::Struct(fields) => fields.iter().map(|f| count_cols(f)).sum(), + ArrowDataType::Union(fields, _) => fields.iter().map(|(_, f)| count_cols(f)).sum(), ArrowDataType::List(field) | ArrowDataType::LargeList(field) | ArrowDataType::FixedSizeList(field, _) @@ -228,6 +330,7 @@ fn get_indices( if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { + ensure_data_types(&requested_field.data_type, field.data_type())?; found_fields.insert(requested_field.name()); mask_indices.push(parquet_offset + parquet_index); reorder_indices.push(ReorderIndex::Index { index }); @@ -381,9 +484,9 @@ pub(crate) fn reorder_struct_array( )); } ReorderIndex::Null { index, field } => { - let null_arry = Arc::new(new_null_array(field.data_type(), num_rows)); + let null_array = Arc::new(new_null_array(field.data_type(), num_rows)); let field = field.clone(); // cheap Arc clone - final_fields_cols[*index] = Some((field, null_arry)); + final_fields_cols[*index] = Some((field, null_array)); } } } @@ -425,8 +528,12 @@ fn reorder_list( new_list_field.clone(), list_field.is_nullable(), )); - let list = - Arc::new(GenericListArray::try_new(new_list_field, offset_buffer, result_array, null_buf)?); + let list = Arc::new(GenericListArray::try_new( + new_list_field, + offset_buffer, + result_array, + null_buf, + )?); Ok(Some((new_field, list))) } else { Err(Error::generic( @@ -497,6 +604,32 @@ mod tests { assert_eq!(reorder_indices, expect_reorder); } + #[test] + fn ensure_data_types_fails_correctly() { + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new("s", DataType::INTEGER, 
true), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("s", ArrowDataType::Utf8, true), + ])); + let res = get_requested_indices(&kernel_schema, &arrow_schema); + assert!(res.is_err()); + + let kernel_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new("s", DataType::STRING, true), + ])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("s", ArrowDataType::Int32, true), + ])); + let res = get_requested_indices(&kernel_schema, &arrow_schema); + println!("{res:#?}"); + assert!(res.is_err()); + } + #[test] fn simple_reorder_indices() { let kernel_schema = Arc::new(StructType::new(vec![ @@ -767,7 +900,7 @@ mod tests { "list", ArrayType::new( StructType::new(vec![ - StructField::new("string", DataType::INTEGER, false), + StructField::new("string", DataType::STRING, false), StructField::new("int2", DataType::INTEGER, false), ]) .into(), From 2937f77255705676a5b29ac7554d8e569392277a Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 12:16:24 -0700 Subject: [PATCH 24/54] use ok_or_else for map key/val --- kernel/src/engine/arrow_utils.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index ae0959609..fd16e3582 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -291,16 +291,12 @@ fn get_indices( (ArrowDataType::Struct(inner_fields), DataType::Map(map_type)) => { let mut key_val_names = inner_fields.iter().map(|f| f.name().to_string()); - let key_name = if let Some(key_name) = key_val_names.next() { - key_name - } else { - return Err(Error::generic("map fields didn't include a key col")); - }; - let val_name = if let Some(val_name) = key_val_names.next() { - val_name - } else { - return 
Err(Error::generic("map fields didn't include a val col")); - }; + let key_name = key_val_names.next().ok_or_else(|| { + Error::generic("map fields didn't include a key col") + })?; + let val_name = key_val_names.next().ok_or_else(|| { + Error::generic("map fields didn't include a val col") + })?; if key_val_names.next().is_some() { return Err(Error::generic("map fields had more than 2 members")); } From 5bcc2ee47882011672b35e149b5ed10bed5668b8 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 12:36:53 -0700 Subject: [PATCH 25/54] parquet files actually contain integers, not logns --- kernel/tests/read.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index ed2f15240..275b18791 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -27,7 +27,7 @@ const PARQUET_FILE2: &str = "part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c00 const METADATA: &str = r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isBlindAppend":true}} {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} -{"metaData":{"id":"5fba94ed-9794-4965-ba6e-6ee3c0d22af9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"val\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1587968585495}}"#; +{"metaData":{"id":"5fba94ed-9794-4965-ba6e-6ee3c0d22af9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"val\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1587968585495}}"#; enum TestAction { Add(String), @@ -301,7 +301,7 @@ async fn stats() -> Result<(), Box> { 
use BinaryOperator::{ Equal, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, NotEqual, }; - let test_cases: Vec<(_, i64, _)> = vec![ + let test_cases: Vec<(_, i32, _)> = vec![ (Equal, 0, vec![]), (Equal, 1, vec![&batch1]), (Equal, 3, vec![&batch1]), From ab8438894fdaa37c3a2af04764395a5ca2ba46ba Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 14:55:55 -0700 Subject: [PATCH 26/54] add simple map test --- kernel/src/engine/arrow_utils.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index fd16e3582..174405ad7 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -554,7 +554,7 @@ mod tests { SchemaRef as ArrowSchemaRef, }; - use crate::schema::{ArrayType, DataType, StructField, StructType}; + use crate::schema::{ArrayType, DataType, MapType, StructField, StructType}; use super::{get_requested_indices, reorder_struct_array, ReorderIndex}; @@ -626,6 +626,33 @@ mod tests { assert!(res.is_err()); } + #[test] + fn mask_with_map() { + let kernel_schema = Arc::new(StructType::new(vec![StructField::new( + "map", + DataType::Map(Box::new(MapType::new( + DataType::INTEGER, + DataType::STRING, + false, + ))), + false, + )])); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new_map( + "map", + "entries", + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("s", ArrowDataType::Utf8, false), + false, + false, + )])); + let (mask_indices, reorder_indices) = + get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + let expect_mask = vec![0, 1]; + let expect_reorder = vec![rii(0)]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + #[test] fn simple_reorder_indices() { let kernel_schema = Arc::new(StructType::new(vec![ From 189767f4c30344f0e65a14641542ad4de097398d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 
Jul 2024 14:58:17 -0700 Subject: [PATCH 27/54] iflet better --- kernel/src/engine/arrow_utils.rs | 99 +++++++++++++++----------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 174405ad7..6d64576ff 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -210,26 +210,24 @@ fn get_indices( if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { - match requested_field.data_type { - DataType::Struct(ref requested_schema) => { - let (parquet_advance, children) = get_indices( - parquet_index + parquet_offset, - requested_schema.as_ref(), - fields, - mask_indices, - )?; - // advance the number of parquet fields, but subtract 1 because the - // struct will be counted by the `enumerate` call but doesn't count as - // an actual index. - parquet_offset += parquet_advance - 1; - // note that we found this field - found_fields.insert(requested_field.name()); - // push the child reorder on - reorder_indices.push(ReorderIndex::Child { index, children }); - } - _ => { - return Err(Error::unexpected_column_type(field.name())); - } + if let DataType::Struct(ref requested_schema) = requested_field.data_type { + let (parquet_advance, children) = get_indices( + parquet_index + parquet_offset, + requested_schema.as_ref(), + fields, + mask_indices, + )?; + // advance the number of parquet fields, but subtract 1 because the + // struct will be counted by the `enumerate` call but doesn't count as + // an actual index. 
+ parquet_offset += parquet_advance - 1; + // note that we found this field + found_fields.insert(requested_field.name()); + // push the child reorder on + reorder_indices.push(ReorderIndex::Child { index, children }); + } + else { + return Err(Error::unexpected_column_type(field.name())); } } else { // We're NOT selecting this field, but we still need to update how much we skip @@ -247,39 +245,36 @@ fn get_indices( { // we just want to transparently recurse into lists, need to transform the kernel // list data type into a schema - match requested_field.data_type() { - DataType::Array(array_type) => { - let requested_schema = StructType::new(vec![StructField::new( - list_field.name().clone(), // so we find it in the inner call - array_type.element_type.clone(), - array_type.contains_null, - )]); - let (parquet_advance, children) = get_indices( - found_fields.len() + parquet_offset, - &requested_schema, - &[list_field.clone()].into(), - mask_indices, - )?; - // see comment above in struct match arm - parquet_offset += parquet_advance - 1; - found_fields.insert(requested_field.name()); - if children.len() != 1 { - return Err( - Error::generic( - "List call should not have generated more than one reorder index" - ) - ); - } - // safety, checked that we have 1 element - let mut children = children.into_iter().next().unwrap(); - // the index is wrong, as it's the index from the inner schema. 
Adjust - // it to be our index - children.set_index(index); - reorder_indices.push(children); - } - _ => { - return Err(Error::unexpected_column_type(list_field.name())); + if let DataType::Array(array_type) = requested_field.data_type() { + let requested_schema = StructType::new(vec![StructField::new( + list_field.name().clone(), // so we find it in the inner call + array_type.element_type.clone(), + array_type.contains_null, + )]); + let (parquet_advance, children) = get_indices( + found_fields.len() + parquet_offset, + &requested_schema, + &[list_field.clone()].into(), + mask_indices, + )?; + // see comment above in struct match arm + parquet_offset += parquet_advance - 1; + found_fields.insert(requested_field.name()); + if children.len() != 1 { + return Err( + Error::generic( + "List call should not have generated more than one reorder index" + ) + ); } + // safety, checked that we have 1 element + let mut children = children.into_iter().next().unwrap(); + // the index is wrong, as it's the index from the inner schema. Adjust + // it to be our index + children.set_index(index); + reorder_indices.push(children); + } else { + return Err(Error::unexpected_column_type(list_field.name())); } } } From 94b1dc905ae0d216ccb932f94b4e9d237cdca1b9 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 14:59:12 -0700 Subject: [PATCH 28/54] no need for ref --- kernel/src/engine/arrow_utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 6d64576ff..22f67fdd4 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -162,7 +162,7 @@ impl ReorderIndex { /// reorder_struct_array to insert the null col fn is_ordered(&self) -> bool { match self { - ReorderIndex::Child { ref children, .. } => is_ordered(children), + ReorderIndex::Child { children, .. } => is_ordered(children), ReorderIndex::Index { .. } => true, ReorderIndex::Null { .. 
} => false, } From 41d177e129a848e01c732e96881c43d7a73d75bd Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 15:01:51 -0700 Subject: [PATCH 29/54] Null -> Missing --- kernel/src/engine/arrow_utils.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 22f67fdd4..7bb595f2a 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -134,7 +134,7 @@ pub(crate) enum ReorderIndex { Index { index: usize, }, - Null { + Missing { index: usize, field: ArrowFieldRef, }, @@ -145,7 +145,7 @@ impl ReorderIndex { match self { ReorderIndex::Child { index, .. } => *index, ReorderIndex::Index { index, .. } => *index, - ReorderIndex::Null { index, .. } => *index, + ReorderIndex::Missing { index, .. } => *index, } } @@ -153,7 +153,7 @@ impl ReorderIndex { match self { ReorderIndex::Child { ref mut index, .. } => *index = target_index, ReorderIndex::Index { ref mut index } => *index = target_index, - ReorderIndex::Null { ref mut index, .. } => *index = target_index, + ReorderIndex::Missing { ref mut index, .. } => *index = target_index, } } @@ -164,7 +164,7 @@ impl ReorderIndex { match self { ReorderIndex::Child { children, .. } => is_ordered(children), ReorderIndex::Index { .. } => true, - ReorderIndex::Null { .. } => false, + ReorderIndex::Missing { .. 
} => false, } } } @@ -335,7 +335,7 @@ fn get_indices( if !found_fields.contains(field.name()) { if field.nullable { debug!("Inserting missing and nullable field: {}", field.name()); - reorder_indices.push(ReorderIndex::Null { + reorder_indices.push(ReorderIndex::Missing { index: requested_position, field: Arc::new(field.try_into()?), }); @@ -474,7 +474,7 @@ pub(crate) fn reorder_struct_array( input_cols[parquet_position].clone(), // cheap Arc clone )); } - ReorderIndex::Null { index, field } => { + ReorderIndex::Missing { index, field } => { let null_array = Arc::new(new_null_array(field.data_type(), num_rows)); let field = field.clone(); // cheap Arc clone final_fields_cols[*index] = Some((field, null_array)); @@ -685,7 +685,7 @@ mod tests { let expect_reorder = vec![ rii(0), rii(2), - ReorderIndex::Null { + ReorderIndex::Missing { index: 1, field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), }, @@ -1078,7 +1078,7 @@ mod tests { children: vec![ rii(0), rii(1), - ReorderIndex::Null { + ReorderIndex::Missing { index: 2, field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), }, From 0210bb78ad8d9262c8d5ff7b70daed5f8d626f1d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 16:05:09 -0700 Subject: [PATCH 30/54] refactor ReorderIndex into a struct --- kernel/src/engine/arrow_utils.rs | 268 ++++++++++++++++--------------- 1 file changed, 136 insertions(+), 132 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 7bb595f2a..76881a1b6 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -126,45 +126,47 @@ pub(crate) fn ensure_data_types( /// `Child` variant, then at that index there is a `Struct` whose ordering is specified by the /// values in the associated `Vec` according to these same rules. 
#[derive(Debug, PartialEq)] -pub(crate) enum ReorderIndex { - Child { - index: usize, - children: Vec, - }, - Index { - index: usize, - }, - Missing { - index: usize, - field: ArrowFieldRef, - }, +pub(crate) struct ReorderIndex { + pub(crate) index: usize, + kind: ReorderIndexKind, +} + +#[derive(Debug, PartialEq)] +pub(crate) enum ReorderIndexKind { + Child(Vec), + Index, + Missing(ArrowFieldRef), } impl ReorderIndex { - fn index(&self) -> usize { - match self { - ReorderIndex::Child { index, .. } => *index, - ReorderIndex::Index { index, .. } => *index, - ReorderIndex::Missing { index, .. } => *index, + fn new_child(index: usize, children: Vec) -> Self { + ReorderIndex { + index, + kind: ReorderIndexKind::Child(children), } } - fn set_index(&mut self, target_index: usize) { - match self { - ReorderIndex::Child { ref mut index, .. } => *index = target_index, - ReorderIndex::Index { ref mut index } => *index = target_index, - ReorderIndex::Missing { ref mut index, .. } => *index = target_index, + fn new_index(index: usize) -> Self { + ReorderIndex { + index, + kind: ReorderIndexKind::Index, } } - /// check if this indexing is ordered. an `Index` variant is ordered by definition. a `Null` - /// variant is not because if we have a `Null` variant we need to do work in - /// reorder_struct_array to insert the null col - fn is_ordered(&self) -> bool { - match self { - ReorderIndex::Child { children, .. } => is_ordered(children), - ReorderIndex::Index { .. } => true, - ReorderIndex::Missing { .. } => false, + fn new_missing(index: usize, field: ArrowFieldRef) -> Self { + ReorderIndex { + index, + kind: ReorderIndexKind::Missing(field), + } + } + + /// Check if this reordering contains a `Missing` variant anywhere. See comment below on + /// [`is_ordered`] to understand why this is needed. 
+ fn contains_missing(&self) -> bool { + match self.kind { + ReorderIndexKind::Child(ref children) => is_ordered(children), + ReorderIndexKind::Index => true, + ReorderIndexKind::Missing(_) => false, } } } @@ -224,9 +226,8 @@ fn get_indices( // note that we found this field found_fields.insert(requested_field.name()); // push the child reorder on - reorder_indices.push(ReorderIndex::Child { index, children }); - } - else { + reorder_indices.push(ReorderIndex::new_child(index, children)); + } else { return Err(Error::unexpected_column_type(field.name())); } } else { @@ -261,17 +262,15 @@ fn get_indices( parquet_offset += parquet_advance - 1; found_fields.insert(requested_field.name()); if children.len() != 1 { - return Err( - Error::generic( - "List call should not have generated more than one reorder index" - ) - ); + return Err(Error::generic( + "List call should not have generated more than one reorder index", + )); } // safety, checked that we have 1 element let mut children = children.into_iter().next().unwrap(); // the index is wrong, as it's the index from the inner schema. 
Adjust // it to be our index - children.set_index(index); + children.index = index; reorder_indices.push(children); } else { return Err(Error::unexpected_column_type(list_field.name())); @@ -309,7 +308,7 @@ fn get_indices( // note that we found this field found_fields.insert(requested_field.name()); // push the child reorder on, currently no reordering for maps - reorder_indices.push(ReorderIndex::Index { index }); + reorder_indices.push(ReorderIndex::new_index(index)); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -324,7 +323,7 @@ fn get_indices( ensure_data_types(&requested_field.data_type, field.data_type())?; found_fields.insert(requested_field.name()); mask_indices.push(parquet_offset + parquet_index); - reorder_indices.push(ReorderIndex::Index { index }); + reorder_indices.push(ReorderIndex::new_index(index)); } } } @@ -335,10 +334,10 @@ fn get_indices( if !found_fields.contains(field.name()) { if field.nullable { debug!("Inserting missing and nullable field: {}", field.name()); - reorder_indices.push(ReorderIndex::Missing { - index: requested_position, - field: Arc::new(field.try_into()?), - }); + reorder_indices.push(ReorderIndex::new_missing( + requested_position, + Arc::new(field.try_into()?), + )); } else { return Err(Error::Generic(format!( "Requested field not found in parquet schema, and field is not nullable: {}", @@ -391,19 +390,24 @@ pub(crate) fn generate_mask( )) } -/// Check if an ordering is already ordered +/// Check if an ordering is already ordered. We check if the indices are in ascending order. That's +/// enough to ensure we don't need to do any transformation on the data read from parquet _iff_ +/// there are no `null` columns to insert. If we _do_ need to insert a null column then we need to +/// transform the data. 
Therefore we also call [`contains_missing`] to ensure both the ascending +/// nature of the indices AND that no `Missing` variants exist, and only if both are true do we +/// consider an ordering "ordered". fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { if requested_ordering.is_empty() { return true; } // we have >=1 element. check that the first element is ordered - if !requested_ordering[0].is_ordered() { + if !requested_ordering[0].contains_missing() { return false; } // now check that all elements are ordered wrt. each other, and are internally ordered requested_ordering .windows(2) - .all(|ri| (ri[0].index() < ri[1].index()) && ri[1].is_ordered()) + .all(|ri| (ri[0].index < ri[1].index) && ri[1].contains_missing()) } // we use this as a placeholder for an array and its associated field. We can fill in a Vec of None @@ -430,8 +434,8 @@ pub(crate) fn reorder_struct_array( for (parquet_position, reorder_index) in requested_ordering.iter().enumerate() { // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data - match reorder_index { - ReorderIndex::Child { index, children } => { + match &reorder_index.kind { + ReorderIndexKind::Child(children) => { match input_cols[parquet_position].data_type() { ArrowDataType::Struct(_) => { let struct_array = input_cols[parquet_position].as_struct().clone(); @@ -442,11 +446,12 @@ pub(crate) fn reorder_struct_array( result_array.fields().clone(), input_fields[parquet_position].is_nullable(), )); - final_fields_cols[*index] = Some((new_field, Arc::new(result_array))); + final_fields_cols[reorder_index.index] = + Some((new_field, Arc::new(result_array))); } ArrowDataType::List(_) => { let list_array = input_cols[parquet_position].as_list::().clone(); - final_fields_cols[*index] = reorder_list( + final_fields_cols[reorder_index.index] = reorder_list( list_array, input_fields[parquet_position].name(), children, @@ -454,7 +459,7 
@@ pub(crate) fn reorder_struct_array( } ArrowDataType::LargeList(_) => { let list_array = input_cols[parquet_position].as_list::().clone(); - final_fields_cols[*index] = reorder_list( + final_fields_cols[reorder_index.index] = reorder_list( list_array, input_fields[parquet_position].name(), children, @@ -468,16 +473,16 @@ pub(crate) fn reorder_struct_array( } } } - ReorderIndex::Index { index } => { - final_fields_cols[*index] = Some(( + ReorderIndexKind::Index => { + final_fields_cols[reorder_index.index] = Some(( input_fields[parquet_position].clone(), // cheap Arc clone input_cols[parquet_position].clone(), // cheap Arc clone )); } - ReorderIndex::Missing { index, field } => { + ReorderIndexKind::Missing(field) => { let null_array = Arc::new(new_null_array(field.data_type(), num_rows)); let field = field.clone(); // cheap Arc clone - final_fields_cols[*index] = Some((field, null_array)); + final_fields_cols[reorder_index.index] = Some((field, null_array)); } } } @@ -553,10 +558,6 @@ mod tests { use super::{get_requested_indices, reorder_struct_array, ReorderIndex}; - const fn rii(index: usize) -> ReorderIndex { - ReorderIndex::Index { index } - } - fn nested_arrow_schema() -> ArrowSchemaRef { Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), @@ -590,7 +591,11 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![rii(0), rii(1), rii(2)]; + let expect_reorder = vec![ + ReorderIndex::new_index(0), + ReorderIndex::new_index(1), + ReorderIndex::new_index(2), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -643,7 +648,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1]; - let expect_reorder = vec![rii(0)]; + let expect_reorder = vec![ReorderIndex::new_index(0)]; 
assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -663,7 +668,11 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![rii(2), rii(0), rii(1)]; + let expect_reorder = vec![ + ReorderIndex::new_index(2), + ReorderIndex::new_index(0), + ReorderIndex::new_index(1), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -683,12 +692,9 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1]; let expect_reorder = vec![ - rii(0), - rii(2), - ReorderIndex::Missing { - index: 1, - field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), - }, + ReorderIndex::new_index(0), + ReorderIndex::new_index(2), + ReorderIndex::new_missing(1, Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true))), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -713,12 +719,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - rii(0), - ReorderIndex::Child { - index: 1, - children: vec![rii(0), rii(1)], - }, - rii(2), + ReorderIndex::new_index(0), + ReorderIndex::new_child( + 1, + vec![ReorderIndex::new_index(0), ReorderIndex::new_index(1)], + ), + ReorderIndex::new_index(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -743,12 +749,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - rii(2), - ReorderIndex::Child { - index: 0, - children: vec![rii(1), rii(0)], - }, - rii(1), + ReorderIndex::new_index(2), + ReorderIndex::new_child( + 0, + vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + ), + ReorderIndex::new_index(1), ]; assert_eq!(mask_indices, expect_mask); 
assert_eq!(reorder_indices, expect_reorder); @@ -770,12 +776,9 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - rii(0), - ReorderIndex::Child { - index: 1, - children: vec![rii(0)], - }, - rii(2), + ReorderIndex::new_index(0), + ReorderIndex::new_child(1, vec![ReorderIndex::new_index(0)]), + ReorderIndex::new_index(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -804,7 +807,11 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2]; - let expect_reorder = vec![rii(0), rii(1), rii(2)]; + let expect_reorder = vec![ + ReorderIndex::new_index(0), + ReorderIndex::new_index(1), + ReorderIndex::new_index(2), + ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -850,12 +857,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - rii(0), - ReorderIndex::Child { - index: 1, - children: vec![rii(0), rii(1)], - }, - rii(2), + ReorderIndex::new_index(0), + ReorderIndex::new_child( + 1, + vec![ReorderIndex::new_index(0), ReorderIndex::new_index(1)], + ), + ReorderIndex::new_index(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -899,12 +906,9 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - rii(0), - ReorderIndex::Child { - index: 1, - children: vec![rii(0)], - }, - rii(2), + ReorderIndex::new_index(0), + ReorderIndex::new_child(1, vec![ReorderIndex::new_index(0)]), + ReorderIndex::new_index(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -952,12 +956,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask 
= vec![0, 2, 3, 4]; let expect_reorder = vec![ - rii(0), - ReorderIndex::Child { - index: 1, - children: vec![rii(1), rii(0)], - }, - rii(2), + ReorderIndex::new_index(0), + ReorderIndex::new_child( + 1, + vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + ), + ReorderIndex::new_index(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1007,12 +1011,12 @@ mod tests { get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); let expect_mask = vec![2, 3, 4, 5]; let expect_reorder = vec![ - rii(2), - ReorderIndex::Child { - index: 1, - children: vec![rii(0), rii(1)], - }, - rii(0), + ReorderIndex::new_index(2), + ReorderIndex::new_child( + 1, + vec![ReorderIndex::new_index(0), ReorderIndex::new_index(1)], + ), + ReorderIndex::new_index(0), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1036,7 +1040,7 @@ mod tests { #[test] fn simple_reorder_struct() { let arry = make_struct_array(); - let reorder = vec![rii(1), rii(0)]; + let reorder = vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)]; let ordered = reorder_struct_array(arry, &reorder).unwrap(); assert_eq!(ordered.column_names(), vec!["c", "b"]); } @@ -1069,21 +1073,21 @@ mod tests { ), ]); let reorder = vec![ - ReorderIndex::Child { - index: 1, - children: vec![rii(1), rii(0)], - }, - ReorderIndex::Child { - index: 0, - children: vec![ - rii(0), - rii(1), - ReorderIndex::Missing { - index: 2, - field: Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), - }, + ReorderIndex::new_child( + 1, + vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + ), + ReorderIndex::new_child( + 0, + vec![ + ReorderIndex::new_index(0), + ReorderIndex::new_index(1), + ReorderIndex::new_missing( + 2, + Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), + ), ], - }, + ), ]; let ordered = reorder_struct_array(nested, &reorder).unwrap(); assert_eq!(ordered.column_names(), vec!["struct2", "struct1"]); @@ 
-1128,10 +1132,10 @@ mod tests { false, )); let struct_array = StructArray::from(vec![(list_dt, list as ArrowArrayRef)]); - let reorder = vec![ReorderIndex::Child { - index: 0, - children: vec![rii(1), rii(0)], - }]; + let reorder = vec![ReorderIndex::new_child( + 0, + vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + )]; let ordered = reorder_struct_array(struct_array, &reorder).unwrap(); let ordered_list_col = ordered.column(0).as_list::(); for i in 0..ordered_list_col.len() { From c04082779759aa3425c9a34401bdf694682129eb Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 16:25:43 -0700 Subject: [PATCH 31/54] some cleanup --- kernel/src/engine/arrow_utils.rs | 90 ++++++++++++++++---------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 76881a1b6..c6e4fbe68 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -42,7 +42,7 @@ pub(crate) fn ensure_data_types( } else { Err(make_arrow_error(format!( "Incorrect datatype. Expected {}, got {}", - kernel_type, arrow_type + converted_type, arrow_type ))) } } @@ -354,7 +354,7 @@ fn get_indices( } /// Get the indices in `parquet_schema` of the specified columns in `requested_schema`. This returns -/// a tuples of (mask_indices: Vec, reorder_indices: +/// a tuple of (mask_indices: Vec, reorder_indices: /// Vec). `mask_indices` is used for generating the mask for reading from the /// parquet file, and simply contains an entry for each index we wish to select from the parquet /// file set to the index of the requested column in the parquet. 
`reorder_indices` is used for @@ -558,7 +558,7 @@ mod tests { use super::{get_requested_indices, reorder_struct_array, ReorderIndex}; - fn nested_arrow_schema() -> ArrowSchemaRef { + fn nested_parquet_schema() -> ArrowSchemaRef { Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new( @@ -578,18 +578,18 @@ mod tests { #[test] fn simple_mask_indices() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new("s", DataType::STRING, true), StructField::new("i2", DataType::INTEGER, true), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new("s", ArrowDataType::Utf8, true), ArrowField::new("i2", ArrowDataType::Int32, true), ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -602,33 +602,33 @@ mod tests { #[test] fn ensure_data_types_fails_correctly() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new("s", DataType::INTEGER, true), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new("s", ArrowDataType::Utf8, true), ])); - let res = get_requested_indices(&kernel_schema, &arrow_schema); + let res = get_requested_indices(&requested_schema, &parquet_schema); assert!(res.is_err()); - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, 
false), StructField::new("s", DataType::STRING, true), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new("s", ArrowDataType::Int32, true), ])); - let res = get_requested_indices(&kernel_schema, &arrow_schema); + let res = get_requested_indices(&requested_schema, &parquet_schema); println!("{res:#?}"); assert!(res.is_err()); } #[test] fn mask_with_map() { - let kernel_schema = Arc::new(StructType::new(vec![StructField::new( + let requested_schema = Arc::new(StructType::new(vec![StructField::new( "map", DataType::Map(Box::new(MapType::new( DataType::INTEGER, @@ -637,7 +637,7 @@ mod tests { ))), false, )])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new_map( + let parquet_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new_map( "map", "entries", ArrowField::new("i", ArrowDataType::Int32, false), @@ -646,7 +646,7 @@ mod tests { false, )])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; let expect_reorder = vec![ReorderIndex::new_index(0)]; assert_eq!(mask_indices, expect_mask); @@ -655,18 +655,18 @@ mod tests { #[test] fn simple_reorder_indices() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new("s", DataType::STRING, true), StructField::new("i2", DataType::INTEGER, true), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i2", ArrowDataType::Int32, true), ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new("s", ArrowDataType::Utf8, true), ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); 
+ get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ ReorderIndex::new_index(2), @@ -679,17 +679,17 @@ mod tests { #[test] fn simple_nullable_field_missing() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new("s", DataType::STRING, true), StructField::new("i2", DataType::INTEGER, true), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new("i2", ArrowDataType::Int32, true), ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -702,7 +702,7 @@ mod tests { #[test] fn nested_indices() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new( "nested", @@ -714,9 +714,9 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = nested_arrow_schema(); + let parquet_schema = nested_parquet_schema(); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -732,7 +732,7 @@ mod tests { #[test] fn nested_indices_reorder() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new( "nested", StructType::new(vec![ @@ -744,9 +744,9 @@ mod tests { StructField::new("j", DataType::INTEGER, false), StructField::new("i", 
DataType::INTEGER, false), ])); - let arrow_schema = nested_arrow_schema(); + let parquet_schema = nested_parquet_schema(); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ ReorderIndex::new_index(2), @@ -762,7 +762,7 @@ mod tests { #[test] fn nested_indices_mask_inner() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new( "nested", @@ -771,9 +771,9 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = nested_arrow_schema(); + let parquet_schema = nested_parquet_schema(); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -786,12 +786,12 @@ mod tests { #[test] fn simple_list_mask() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new("list", ArrayType::new(DataType::INTEGER, false), false), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new( "list", @@ -805,7 +805,7 @@ mod tests { ArrowField::new("j", ArrowDataType::Int32, false), ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -818,7 +818,7 @@ mod 
tests { #[test] fn nested_indices_list() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new( "list", @@ -834,7 +834,7 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new( "list", @@ -854,7 +854,7 @@ mod tests { ArrowField::new("j", ArrowDataType::Int32, false), ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -870,7 +870,7 @@ mod tests { #[test] fn nested_indices_list_mask_inner() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new( "list", @@ -883,7 +883,7 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), ArrowField::new( "list", @@ -903,7 +903,7 @@ mod tests { ArrowField::new("j", ArrowDataType::Int32, false), ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -916,7 +916,7 @@ mod tests { #[test] fn nested_indices_list_mask_inner_reorder() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new( 
"list", @@ -932,7 +932,7 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i", ArrowDataType::Int32, false), // field 0 ArrowField::new( "list", @@ -953,7 +953,7 @@ mod tests { ArrowField::new("j", ArrowDataType::Int32, false), // field 4 ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 2, 3, 4]; let expect_reorder = vec![ ReorderIndex::new_index(0), @@ -969,7 +969,7 @@ mod tests { #[test] fn skipped_struct() { - let kernel_schema = Arc::new(StructType::new(vec![ + let requested_schema = Arc::new(StructType::new(vec![ StructField::new("i", DataType::INTEGER, false), StructField::new( "nested", @@ -981,7 +981,7 @@ mod tests { ), StructField::new("j", DataType::INTEGER, false), ])); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ + let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new( "skipped", ArrowDataType::Struct( @@ -1008,7 +1008,7 @@ mod tests { ArrowField::new("i", ArrowDataType::Int32, false), ])); let (mask_indices, reorder_indices) = - get_requested_indices(&kernel_schema, &arrow_schema).unwrap(); + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![2, 3, 4, 5]; let expect_reorder = vec![ ReorderIndex::new_index(2), From b54a79144e6f90b8f21eaa66ead12c2b74ca64cd Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 16:42:29 -0700 Subject: [PATCH 32/54] add timestamp special case + no_matches test --- kernel/src/engine/arrow_utils.rs | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index c6e4fbe68..07f8326a0 100644 --- a/kernel/src/engine/arrow_utils.rs +++ 
b/kernel/src/engine/arrow_utils.rs @@ -35,6 +35,16 @@ pub(crate) fn ensure_data_types( arrow_type: &ArrowDataType, ) -> DeltaResult<()> { match (kernel_type, arrow_type) { + (DataType::Primitive(PrimitiveType::Timestamp), ArrowDataType::Timestamp(_, _)) + | (DataType::Primitive(PrimitiveType::TimestampNtz), ArrowDataType::Timestamp(_, _)) => { + // We assume that any timestamp data read from a delta table is correctly written in + // microseconds and with the right timezone info. there seems to be an issue at least on + // MacOS where the parquet crate reports `Timestamp(Nanoseconds, None)` even though the + // parquet footer indicates `timeUnit=microseconds` and `isAdjustedToUTC=true`. Will + // follow-up upstream to see if this is a bug in the parquet crate. + // TODO: FILL IN ISSUE NUMBER(s) + Ok(()) + } (DataType::Primitive(_), _) if arrow_type.is_primitive() => { let converted_type: ArrowDataType = kernel_type.try_into()?; if &converted_type == arrow_type { @@ -1144,4 +1154,27 @@ mod tests { assert_eq!(struct_item.column_names(), vec!["c", "b"]); } } + + #[test] + fn no_matches() { + let requested_schema = Arc::new(StructType::new(vec![ + StructField::new("s", DataType::STRING, true), + StructField::new("i2", DataType::INTEGER, true), + ])); + let nots_field = ArrowField::new("NOTs", ArrowDataType::Utf8, true); + let noti2_field = ArrowField::new("NOTi2", ArrowDataType::Int32, true); + let parquet_schema = Arc::new(ArrowSchema::new(vec![ + nots_field.clone(), + noti2_field.clone(), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); + let expect_mask: Vec = vec![]; + let expect_reorder = vec![ + ReorderIndex::new_missing(0, nots_field.with_name("s").into()), + ReorderIndex::new_missing(1, noti2_field.with_name("i2").into()), + ]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } } From dfd662f834f4e8e884760b45af824551722a2491 Mon Sep 17 00:00:00 2001 
From: Nick Lanham Date: Wed, 10 Jul 2024 16:51:39 -0700 Subject: [PATCH 33/54] Add empty_requested_schema test --- kernel/src/engine/arrow_utils.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 07f8326a0..40ad31e52 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -1177,4 +1177,20 @@ mod tests { assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } + + #[test] + fn empty_requested_schema() { + let requested_schema = Arc::new(StructType::new(vec![])); + let parquet_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("s", ArrowDataType::Utf8, true), + ArrowField::new("i2", ArrowDataType::Int32, true), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); + let expect_mask: Vec = vec![]; + let expect_reorder = vec![]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } } From 1b372a54fdfa514adb2a1067b06b4d6c9f31e1d9 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 17:08:17 -0700 Subject: [PATCH 34/54] flatten and unzip --- kernel/src/engine/arrow_utils.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 40ad31e52..ca6a80415 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -496,16 +496,13 @@ pub(crate) fn reorder_struct_array( } } } - let mut field_vec = Vec::with_capacity(num_cols); - let mut reordered_columns = Vec::with_capacity(num_cols); - for field_array_opt in final_fields_cols.into_iter() { - let (field, array) = field_array_opt.ok_or_else(|| { - Error::generic( - "Found a None in final_fields_cols. 
This is a kernel bug, please report.", - ) - })?; - field_vec.push(field); - reordered_columns.push(array); + let num_cols = final_fields_cols.len(); + let (field_vec, reordered_columns): (Vec>, _) = + final_fields_cols.into_iter().flatten().unzip(); + if field_vec.len() != num_cols { + return Err(Error::generic( + "Found a None in final_fields_cols. This is a kernel bug, please report.", + )); } Ok(StructArray::try_new( field_vec.into(), From c35510077991524b43cb3ab64d2a2052caacf5b0 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Wed, 10 Jul 2024 17:51:26 -0700 Subject: [PATCH 35/54] add a giant doc comment --- kernel/src/engine/arrow_utils.rs | 90 ++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index ca6a80415..979acc0b7 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -130,6 +130,96 @@ pub(crate) fn ensure_data_types( } } +/* +* The code below implements proper pruning of columns when reading parquet, reordering of columns to +* match the specified schema, and insertion of null columns if the requested schema includes a +* nullable column that isn't included in the parquet file. +* +* At a high level there are three schemas/concepts to worry about: +* - The parquet file's physical schema (= the columns that are actually available), called +* "parquet_schema" below +* - The requested logical schema from the engine (= the columns we actually want, a superset of +* the read schema), called "requested_schema" below +* - A `ProjectionMask` that goes to the parquet reader which specifies which subset of columns from +* the file schema to actually read. (See "Example" below) +* +* In other words, the ProjectionMask is the intersection of file schema and logical schema, and then +* mapped to indices in the parquet file. 
Columns unique to the file schema need to be masked out (= +* ignored), while columns unique to the logical schema need to be backfilled with nulls. +* +* We also have to worry about field ordering differences between read schema and logical schema. We +* represent any reordering needed as a tree. Each level of the tree is a vec of +* `ReorderIndex`s. Each element's index represents a column that will be in the read parquet data +* (as an arrow StructArray) at that level and index. The `ReorderIndex::index` field of the element +* is the position that the column should appear in the final output. + +* The algorithm has three parts, handled by `get_requested_indices`, `generate_mask` and +* `reorder_struct_array` respectively. + +* `get_requested_indices` generates indices to select and reordering information: +* 1. Loop over each field in parquet_schema, keeping track of how many physical fields (i.e. actual +* stored columns) we have seen so far +* 2. If a requested field matches the physical field, push the index of the field onto the mask. +* 3. Also push a ReorderIndex element that indicates where this item should be in the final output. +* 4. If a nested element (struct/map/list) is encountered, recurse into it, pushing indices onto the +* same vector, but producing a new reorder level, which is added to the parent with a `Child` kind +* +* `generate_mask` is simple, and just calls `ProjectionMask::leaves` in the parquet crate with the +* indices computed by `get_requested_indices` +* +* `reorder_struct_array` handles reordering: +* 1. First check if we're already in order (see doc comment for `is_ordered`) +* 2. If ordered we're done, return, otherwise: +* 3. Create a Vec[None, ..., None] of placeholders that will hold the correctly ordered columns +* 4. Deconstruct the existing struct array and then loop over the `ReorderIndex` list +* 5. If the `kind` is Index: put the column at the correct location +* 6. 
If the `kind` is Missing: put a column of `null` at the correct location +* 7. If the `kind` is Child([child_order]) and the data is a `StructArray`, recursively call +* `reorder_struct_array` on the column with `child_order` and put the resulting, now correctly +* ordered array, at the correct location +* 8. If the `kind` is Child and the data is a `List`, get the inner struct array out of +* the list, reorder it recursively as above, rebuild the list, and then put the column at the +* correct location +* +* Example: +* The parquet crate treats columns as being actually "flat", so a struct column is purely a schema +* level thing and doesn't "count" wrt. column indices. +* +* So if we have the following file physical schema: +* +* a +* d +* x +* b +* y +* z +* e +* f +* c +* +* and a logical requested schema of: +* +* b +* f +* e +* a +* x +* c +* +* The mask is [1, 3, 4, 5] because a, b, and y don't contribute to the column indices. +* +* The reorder tree is: +* [ +* // col a is at position 0 in the struct array, and should be moved to position 1 +* { index: 1, Child([{ index: 0 }]) }, +* // col b is at position 1 in the struct array, and should be moved to position 0 +* // also, the inner struct array needs to be reordered to swap 'f' and 'e' +* { index: 0, Child([{ index: 1 }, {index: 0}]) }, +* // col c is at position 2 in the struct array, and should stay there +* { index: 2 } +* ] +*/ + /// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. Each element's index /// represents a column that will be in the read parquet data at that level and index. The `index()` /// of the element is the position that the column should appear in the final output.
If it is a From 6b92d7489b33d0266f314ab93bed3d5709cef0ef Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Thu, 11 Jul 2024 17:33:02 -0700 Subject: [PATCH 36/54] undo the horrible hack, bump dat version --- acceptance/build.rs | 2 +- kernel/src/engine/arrow_utils.rs | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/acceptance/build.rs b/acceptance/build.rs index 32a71399f..23d5a2b1a 100644 --- a/acceptance/build.rs +++ b/acceptance/build.rs @@ -9,7 +9,7 @@ use tar::Archive; const DAT_EXISTS_FILE_CHECK: &str = "tests/dat/.done"; const OUTPUT_FOLDER: &str = "tests/dat"; -const VERSION: &str = "0.0.2"; +const VERSION: &str = "0.0.3"; fn main() { if dat_exists() { diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 979acc0b7..98c0e5082 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -35,16 +35,6 @@ pub(crate) fn ensure_data_types( arrow_type: &ArrowDataType, ) -> DeltaResult<()> { match (kernel_type, arrow_type) { - (DataType::Primitive(PrimitiveType::Timestamp), ArrowDataType::Timestamp(_, _)) - | (DataType::Primitive(PrimitiveType::TimestampNtz), ArrowDataType::Timestamp(_, _)) => { - // We assume that any timestamp data read from a delta table is correctly written in - // microseconds and with the right timezone info. there seems to be an issue at least on - // MacOS where the parquet crate reports `Timestamp(Nanoseconds, None)` even though the - // parquet footer indicates `timeUnit=microseconds` and `isAdjustedToUTC=true`. Will - // follow-up upstream to see if this is a bug in the parquet crate. 
- // TODO: FILL IN ISSUE NUMBER(s) - Ok(()) - } (DataType::Primitive(_), _) if arrow_type.is_primitive() => { let converted_type: ArrowDataType = kernel_type.try_into()?; if &converted_type == arrow_type { From 56fd9d481ab428db31c30671aa1e42b94d05f1f1 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 12 Jul 2024 13:20:43 -0700 Subject: [PATCH 37/54] handle timestamp conversions --- Cargo.toml | 1 + kernel/Cargo.toml | 5 +- kernel/src/engine/arrow_utils.rs | 125 ++++++++++++++++++++++--------- kernel/tests/read.rs | 22 +++--- 4 files changed, 107 insertions(+), 46 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 381bf2199..cde0af3c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ version = "0.1.1" arrow = { version = "^52.0" } arrow-arith = { version = "^52.0" } arrow-array = { version = "^52.0" } +arrow-cast = { version = "^52.0" } arrow-data = { version = "^52.0" } arrow-ord = { version = "^52.0" } arrow-json = { version = "^52.0" } diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index 601feb0ed..82fd64c08 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -37,9 +37,10 @@ delta_kernel_derive = { path = "../derive-macros", version = "0.1.1" } visibility = "0.1.0" # Used in default engine -arrow-array = { workspace = true, optional = true } +arrow-array = { workspace = true, optional = true, features = ["chrono-tz"] } arrow-select = { workspace = true, optional = true } arrow-arith = { workspace = true, optional = true } +arrow-cast = { workspace = true, optional = true } arrow-json = { workspace = true, optional = true } arrow-ord = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } @@ -67,6 +68,7 @@ default-engine = [ "arrow-conversion", "arrow-expression", "arrow-array", + "arrow-cast", "arrow-json", "arrow-schema", "arrow-select", @@ -80,6 +82,7 @@ default-engine = [ developer-visibility = [] sync-engine = [ + "arrow-cast", "arrow-conversion", "arrow-expression", "arrow-array", diff --git 
a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 98c0e5082..49a4e6b9e 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -24,6 +24,34 @@ fn make_arrow_error(s: String) -> Error { Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(s)) } +/// Capture the compatibility between two data-types, as passed to [`ensure_data_types`] +pub(crate) enum DataTypeCompat { + /// The two types are the same + Identical, + /// What is read from parquet needs to be cast to the associated type + NeedsCast(ArrowDataType), + /// Types are compatible, but are nested types. This is used when comparing types where casting + /// is not desired (i.e. in the expression evaluator) + Nested, +} + +// Check if two types can be cast +fn check_cast_compat( + source_type: &ArrowDataType, + target_type: ArrowDataType, +) -> DeltaResult { + match (source_type, &target_type) { + (&ArrowDataType::Timestamp(_, _), &ArrowDataType::Timestamp(_, _)) => { + // timestamps are able to be cast between each other + Ok(DataTypeCompat::NeedsCast(target_type)) + } + _ => Err(make_arrow_error(format!( + "Incorrect datatype. Expected {}, got {}", + target_type, source_type + ))), //| (DataType::Primitive(PrimitiveType::TimestampNtz), ArrowDataType::Timestamp(_, _)) => { + } +} + /// Ensure a kernel data type matches an arrow data type. This only ensures that the actual "type" /// is the same, but does so recursively into structs, and ensures lists and maps have the correct /// associated types as well. 
This returns an `Ok(())` if the types are compatible, or an error if @@ -33,31 +61,28 @@ fn make_arrow_error(s: String) -> Error { pub(crate) fn ensure_data_types( kernel_type: &DataType, arrow_type: &ArrowDataType, -) -> DeltaResult<()> { +) -> DeltaResult { match (kernel_type, arrow_type) { (DataType::Primitive(_), _) if arrow_type.is_primitive() => { let converted_type: ArrowDataType = kernel_type.try_into()?; if &converted_type == arrow_type { - Ok(()) + Ok(DataTypeCompat::Identical) } else { - Err(make_arrow_error(format!( - "Incorrect datatype. Expected {}, got {}", - converted_type, arrow_type - ))) + check_cast_compat(arrow_type, converted_type) } } (DataType::Primitive(PrimitiveType::Boolean), ArrowDataType::Boolean) | (DataType::Primitive(PrimitiveType::String), ArrowDataType::Utf8) | (DataType::Primitive(PrimitiveType::Binary), ArrowDataType::Binary) => { // strings, bools, and binary aren't primitive in arrow - Ok(()) + Ok(DataTypeCompat::Identical) } ( DataType::Primitive(PrimitiveType::Decimal(kernel_prec, kernel_scale)), ArrowDataType::Decimal128(arrow_prec, arrow_scale), ) if arrow_prec == kernel_prec && *arrow_scale == *kernel_scale as i8 => { // decimal isn't primitive in arrow. cast above is okay as we limit range - Ok(()) + Ok(DataTypeCompat::Identical) } (DataType::Array(inner_type), ArrowDataType::List(arrow_list_type)) => { let kernel_array_type = &inner_type.element_type; @@ -81,7 +106,7 @@ pub(crate) fn ensure_data_types( "Arrow map struct didn't have a value type".to_string(), )); } - Ok(()) + Ok(DataTypeCompat::Nested) } else { Err(make_arrow_error( "Arrow map type wasn't a struct.".to_string(), @@ -111,7 +136,7 @@ pub(crate) fn ensure_data_types( kernel_field_names, arrow_field_names, )) }); - Ok(()) + Ok(DataTypeCompat::Nested) } _ => Err(make_arrow_error(format!( "Incorrect datatype. Expected {}, got {}", @@ -211,52 +236,65 @@ pub(crate) fn ensure_data_types( */ /// Reordering is specified as a tree. 
Each level is a vec of `ReorderIndex`s. Each element's index -/// represents a column that will be in the read parquet data at that level and index. The `index()` -/// of the element is the position that the column should appear in the final output. If it is a -/// `Child` variant, then at that index there is a `Struct` whose ordering is specified by the -/// values in the associated `Vec` according to these same rules. +/// represents a column that will be in the read parquet data at that level and index. The `index` +/// of the element is the position that the column should appear in the final output. The `transform` +/// indicates what, if any, transforms are needed. See the docs for [`ReorderIndexTransform`] for the +/// meaning. #[derive(Debug, PartialEq)] pub(crate) struct ReorderIndex { pub(crate) index: usize, - kind: ReorderIndexKind, + transform: ReorderIndexTransform, } #[derive(Debug, PartialEq)] -pub(crate) enum ReorderIndexKind { +pub(crate) enum ReorderIndexTransform { + /// For a non-nested type, indicates that we need to cast to the contained type + Cast(ArrowDataType), + /// Used for struct/list/map. 
Potentially transform child fields using contained reordering Child(Vec), - Index, + /// No work needed to transform this data + None, + /// Data is missing, fill in with a null column Missing(ArrowFieldRef), } impl ReorderIndex { + fn new_cast(index: usize, target: ArrowDataType) -> Self { + ReorderIndex { + index, + transform: ReorderIndexTransform::Cast(target), + } + } + fn new_child(index: usize, children: Vec) -> Self { ReorderIndex { index, - kind: ReorderIndexKind::Child(children), + transform: ReorderIndexTransform::Child(children), } } fn new_index(index: usize) -> Self { ReorderIndex { index, - kind: ReorderIndexKind::Index, + transform: ReorderIndexTransform::None, } } fn new_missing(index: usize, field: ArrowFieldRef) -> Self { ReorderIndex { index, - kind: ReorderIndexKind::Missing(field), + transform: ReorderIndexTransform::Missing(field), } } - /// Check if this reordering contains a `Missing` variant anywhere. See comment below on + /// Check if this reordering requires a transformation anywhere. See comment below on /// [`is_ordered`] to understand why this is needed. - fn contains_missing(&self) -> bool { - match self.kind { - ReorderIndexKind::Child(ref children) => is_ordered(children), - ReorderIndexKind::Index => true, - ReorderIndexKind::Missing(_) => false, + fn needs_transform(&self) -> bool { + match self.transform { + ReorderIndexTransform::Cast(_) => true, + ReorderIndexTransform::Child(ref children) => is_ordered(children), + ReorderIndexTransform::None => true, + ReorderIndexTransform::Missing(_) => false, } } } @@ -410,10 +448,18 @@ fn get_indices( if let Some((index, _, requested_field)) = requested_schema.fields.get_full(field.name()) { - ensure_data_types(&requested_field.data_type, field.data_type())?; + match ensure_data_types(&requested_field.data_type, field.data_type())? 
{ + DataTypeCompat::Identical => + reorder_indices.push(ReorderIndex::new_index(index)), + DataTypeCompat::NeedsCast(target) => + reorder_indices.push(ReorderIndex::new_cast(index, target)), + DataTypeCompat::Nested => return + Err(Error::generic( + "Comparing nested types in get_indices. This is a kernel bug, please report" + )) + } found_fields.insert(requested_field.name()); mask_indices.push(parquet_offset + parquet_index); - reorder_indices.push(ReorderIndex::new_index(index)); } } } @@ -491,13 +537,13 @@ fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { return true; } // we have >=1 element. check that the first element is ordered - if !requested_ordering[0].contains_missing() { + if !requested_ordering[0].needs_transform() { return false; } // now check that all elements are ordered wrt. each other, and are internally ordered requested_ordering .windows(2) - .all(|ri| (ri[0].index < ri[1].index) && ri[1].contains_missing()) + .all(|ri| (ri[0].index < ri[1].index) && !ri[1].needs_transform()) } // we use this as a placeholder for an array and its associated field. 
We can fill in a Vec of None @@ -524,8 +570,19 @@ pub(crate) fn reorder_struct_array( for (parquet_position, reorder_index) in requested_ordering.iter().enumerate() { // for each item, reorder_index.index() tells us where to put it, and its position in // requested_ordering tells us where it is in the parquet data - match &reorder_index.kind { - ReorderIndexKind::Child(children) => { + match &reorder_index.transform { + ReorderIndexTransform::Cast(target) => { + let source_col = input_cols[parquet_position].as_ref(); + let new_col = Arc::new(arrow_cast::cast::cast(source_col, target)?); + let new_field = Arc::new( + input_fields[parquet_position] + .as_ref() + .clone() + .with_data_type(new_col.data_type().clone()), + ); + final_fields_cols[reorder_index.index] = Some((new_field, new_col)); + } + ReorderIndexTransform::Child(children) => { match input_cols[parquet_position].data_type() { ArrowDataType::Struct(_) => { let struct_array = input_cols[parquet_position].as_struct().clone(); @@ -563,13 +620,13 @@ pub(crate) fn reorder_struct_array( } } } - ReorderIndexKind::Index => { + ReorderIndexTransform::None => { final_fields_cols[reorder_index.index] = Some(( input_fields[parquet_position].clone(), // cheap Arc clone input_cols[parquet_position].clone(), // cheap Arc clone )); } - ReorderIndexKind::Missing(field) => { + ReorderIndexTransform::Missing(field) => { let null_array = Arc::new(new_null_array(field.data_type(), num_rows)); let field = field.clone(); // cheap Arc clone final_fields_cols[reorder_index.index] = Some((field, null_array)); diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 275b18791..46748304b 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -987,17 +987,17 @@ fn with_predicate_and_removes() -> Result<(), Box> { #[test] fn short_dv() -> Result<(), Box> { let expected = vec![ - "+----+-------+-------------------------+---------------------+", - "| id | value | timestamp | rand |", - 
"+----+-------+-------------------------+---------------------+", - "| 3 | 3 | 2023-05-31T18:58:33.633 | 0.7918174793484931 |", - "| 4 | 4 | 2023-05-31T18:58:33.633 | 0.9281049271981882 |", - "| 5 | 5 | 2023-05-31T18:58:33.633 | 0.27796520310701633 |", - "| 6 | 6 | 2023-05-31T18:58:33.633 | 0.15263801464228832 |", - "| 7 | 7 | 2023-05-31T18:58:33.633 | 0.1981143710215575 |", - "| 8 | 8 | 2023-05-31T18:58:33.633 | 0.3069439236599195 |", - "| 9 | 9 | 2023-05-31T18:58:33.633 | 0.5175919190815845 |", - "+----+-------+-------------------------+---------------------+", + "+----+-------+--------------------------+---------------------+", + "| id | value | timestamp | rand |", + "+----+-------+--------------------------+---------------------+", + "| 3 | 3 | 2023-05-31T18:58:33.633Z | 0.7918174793484931 |", + "| 4 | 4 | 2023-05-31T18:58:33.633Z | 0.9281049271981882 |", + "| 5 | 5 | 2023-05-31T18:58:33.633Z | 0.27796520310701633 |", + "| 6 | 6 | 2023-05-31T18:58:33.633Z | 0.15263801464228832 |", + "| 7 | 7 | 2023-05-31T18:58:33.633Z | 0.1981143710215575 |", + "| 8 | 8 | 2023-05-31T18:58:33.633Z | 0.3069439236599195 |", + "| 9 | 9 | 2023-05-31T18:58:33.633Z | 0.5175919190815845 |", + "+----+-------+--------------------------+---------------------+", ]; read_table_data_str("./tests/data/with-short-dv/", None, None, expected)?; Ok(()) From cda283430133cbc97c8678d424a0cc87e79795d5 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 12 Jul 2024 13:25:56 -0700 Subject: [PATCH 38/54] add basic_decimal test --- kernel/tests/read.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 46748304b..c41bc587d 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -1002,3 +1002,19 @@ fn short_dv() -> Result<(), Box> { read_table_data_str("./tests/data/with-short-dv/", None, None, expected)?; Ok(()) } + +#[test] +fn basic_decimal() -> Result<(), Box> { + let expected = vec![ + 
"+----------------+---------+--------------+------------------------+", + "| part | col1 | col2 | col3 |", + "+----------------+---------+--------------+------------------------+", + "| -2342342.23423 | -999.99 | -99999.99999 | -9999999999.9999999999 |", + "| 0.00004 | 0.00 | 0.00000 | 0.0000000000 |", + "| 234.00000 | 1.00 | 2.00000 | 3.0000000000 |", + "| 2342222.23454 | 111.11 | 22222.22222 | 3333333333.3333333333 |", + "+----------------+---------+--------------+------------------------+", + ]; + read_table_data_str("./tests/data/basic-decimal-table/", None, None, expected)?; + Ok(()) +} From dd127a40f56cb37f4ff0c16b0336625ed1c1d75b Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 12 Jul 2024 13:51:30 -0700 Subject: [PATCH 39/54] add timestamp_ntz test, fix test bug --- kernel/tests/read.rs | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index c41bc587d..b87a80625 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -7,6 +7,7 @@ use arrow::array::{ArrayRef, Int32Array, StringArray}; use arrow::compute::filter_record_batch; use arrow::error::ArrowError; use arrow::record_batch::RecordBatch; +use arrow_schema::{SchemaRef as ArrowSchemaRef}; use arrow_select::concat::concat_batches; use delta_kernel::actions::deletion_vector::split_vector; use delta_kernel::engine::arrow_data::ArrowEngineData; @@ -399,6 +400,7 @@ fn read_with_execute( scan: &Scan, expected: &[String], ) -> Result<(), Box> { + let result_schema: ArrowSchemaRef = Arc::new(scan.schema().as_ref().try_into()?); let scan_results = scan.execute(engine)?; let batches: Vec = scan_results .into_iter() @@ -421,8 +423,7 @@ fn read_with_execute( if expected.is_empty() { assert_eq!(batches.len(), 0); } else { - let schema = batches[0].schema(); - let batch = concat_batches(&schema, &batches)?; + let batch = concat_batches(&result_schema, &batches)?; assert_batches_sorted_eq!(expected, &[batch]); } 
Ok(()) @@ -458,6 +459,7 @@ fn read_with_scan_data( expected: &[String], ) -> Result<(), Box> { let global_state = scan.global_scan_state(); + let result_schema: ArrowSchemaRef = Arc::new(scan.schema().as_ref().try_into()?); let scan_data = scan.scan_data(engine)?; let mut scan_files = vec![]; for data in scan_data { @@ -511,8 +513,7 @@ fn read_with_scan_data( if expected.is_empty() { assert_eq!(batches.len(), 0); } else { - let schema = batches[0].schema(); - let batch = concat_batches(&schema, &batches)?; + let batch = concat_batches(&result_schema, &batches)?; assert_batches_sorted_eq!(expected, &[batch]); } Ok(()) @@ -1018,3 +1019,24 @@ fn basic_decimal() -> Result<(), Box> { read_table_data_str("./tests/data/basic-decimal-table/", None, None, expected)?; Ok(()) } + +#[test] +fn timestamp_ntz() -> Result<(), Box> { + let expected = vec![ + "+----+----------------------------+----------------------------+", + "| id | tsNtz | tsNtzPartition |", + "+----+----------------------------+----------------------------+", + "| 0 | 2021-11-18T02:30:00.123456 | 2021-11-18T02:30:00.123456 |", + "| 1 | 2013-07-05T17:01:00.123456 | 2021-11-18T02:30:00.123456 |", + "| 2 | | 2021-11-18T02:30:00.123456 |", + "| 3 | 2021-11-18T02:30:00.123456 | 2013-07-05T17:01:00.123456 |", + "| 4 | 2013-07-05T17:01:00.123456 | 2013-07-05T17:01:00.123456 |", + "| 5 | | 2013-07-05T17:01:00.123456 |", + "| 6 | 2021-11-18T02:30:00.123456 | |", + "| 7 | 2013-07-05T17:01:00.123456 | |", + "| 8 | | |", + "+----+----------------------------+----------------------------+", + ]; + read_table_data_str("./tests/data/data-reader-timestamp_ntz/", None, None, expected)?; + Ok(()) +} From d772be550074af41c6ad7f80ce3da66aa878c93d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 12 Jul 2024 14:10:19 -0700 Subject: [PATCH 40/54] fmt + test fix --- acceptance/build.rs | 2 +- acceptance/src/data.rs | 11 ++++++----- kernel/src/engine/arrow_utils.rs | 2 +- kernel/tests/read.rs | 9 +++++++-- 4 files changed, 15 
insertions(+), 9 deletions(-) diff --git a/acceptance/build.rs b/acceptance/build.rs index 23d5a2b1a..32a71399f 100644 --- a/acceptance/build.rs +++ b/acceptance/build.rs @@ -9,7 +9,7 @@ use tar::Archive; const DAT_EXISTS_FILE_CHECK: &str = "tests/dat/.done"; const OUTPUT_FOLDER: &str = "tests/dat"; -const VERSION: &str = "0.0.3"; +const VERSION: &str = "0.0.2"; fn main() { if dat_exists() { diff --git a/acceptance/src/data.rs b/acceptance/src/data.rs index 871879ddc..87a774c92 100644 --- a/acceptance/src/data.rs +++ b/acceptance/src/data.rs @@ -60,11 +60,12 @@ pub fn sort_record_batch(batch: RecordBatch) -> DeltaResult { Ok(RecordBatch::try_new(batch.schema(), columns)?) } -static SKIPPED_TESTS: &[&str; 1] = &[ - // For multi_partitioned_2: The golden table stores the timestamp as an INT96 (which is - // nanosecond precision), while the spec says we should read partition columns as - // microseconds. This means the read and golden data don't line up. When this is released in - // `dat` upstream, we can stop skipping this test +static SKIPPED_TESTS: &[&str; 2] = &[ + // For all_primitive_types and multi_partitioned_2: The golden table stores the timestamp as an + // INT96 (which is nanosecond precision), while the spec says we should read partition columns + // as microseconds. This means the read and golden data don't line up. When this is released in + // `dat` upstream, we can stop skipping these tests + "all_primitive_types", "multi_partitioned_2", ]; diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 49a4e6b9e..e8fffb228 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -48,7 +48,7 @@ fn check_cast_compat( _ => Err(make_arrow_error(format!( "Incorrect datatype. 
Expected {}, got {}", target_type, source_type - ))), //| (DataType::Primitive(PrimitiveType::TimestampNtz), ArrowDataType::Timestamp(_, _)) => { + ))), } } diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index b87a80625..1b760fa84 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -7,7 +7,7 @@ use arrow::array::{ArrayRef, Int32Array, StringArray}; use arrow::compute::filter_record_batch; use arrow::error::ArrowError; use arrow::record_batch::RecordBatch; -use arrow_schema::{SchemaRef as ArrowSchemaRef}; +use arrow_schema::SchemaRef as ArrowSchemaRef; use arrow_select::concat::concat_batches; use delta_kernel::actions::deletion_vector::split_vector; use delta_kernel::engine::arrow_data::ArrowEngineData; @@ -1037,6 +1037,11 @@ fn timestamp_ntz() -> Result<(), Box> { "| 8 | | |", "+----+----------------------------+----------------------------+", ]; - read_table_data_str("./tests/data/data-reader-timestamp_ntz/", None, None, expected)?; + read_table_data_str( + "./tests/data/data-reader-timestamp_ntz/", + None, + None, + expected, + )?; Ok(()) } From b5157d75fdd3a21736a3c356b74b7f535f7b9504 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 12 Jul 2024 14:15:49 -0700 Subject: [PATCH 41/54] add new test data --- .../_delta_log/.00000000000000000000.json.crc | Bin 0 -> 32 bytes .../_delta_log/00000000000000000000.json | 7 +++++++ ...2c4-9d22-f83bc81c9b68.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...03-42c4-9d22-f83bc81c9b68.c000.snappy.parquet | Bin 0 -> 1032 bytes ...191-a318-ae9355f877c3.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...d4-4191-a318-ae9355f877c3.c000.snappy.parquet | Bin 0 -> 1033 bytes ...9af-947f-335a5e46ee5c.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...e5-49af-947f-335a5e46ee5c.c000.snappy.parquet | Bin 0 -> 1033 bytes ...d32-a9c0-7171a06547c6.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...d3-4d32-a9c0-7171a06547c6.c000.snappy.parquet | Bin 0 -> 1033 bytes .../_delta_log/.00000000000000000000.json.crc | Bin 0 -> 16 bytes 
.../_delta_log/.00000000000000000001.json.crc | Bin 0 -> 28 bytes .../_delta_log/00000000000000000000.json | 3 +++ .../_delta_log/00000000000000000001.json | 5 +++++ ...49a-a1e6-0e24866d3508.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...bd9-b117-28d871bbb639.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...04-449a-a1e6-0e24866d3508.c000.snappy.parquet | Bin 0 -> 726 bytes ...02-4bd9-b117-28d871bbb639.c000.snappy.parquet | Bin 0 -> 730 bytes ...4f4-96ef-f43825143ba9.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...f3-44f4-96ef-f43825143ba9.c000.snappy.parquet | Bin 0 -> 742 bytes ...59a-921c-bb64bf0bbd03.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...73-459a-921c-bb64bf0bbd03.c000.snappy.parquet | Bin 0 -> 742 bytes 22 files changed, 15 insertions(+) create mode 100644 kernel/tests/data/basic-decimal-table/_delta_log/.00000000000000000000.json.crc create mode 100644 kernel/tests/data/basic-decimal-table/_delta_log/00000000000000000000.json create mode 100644 kernel/tests/data/basic-decimal-table/part=-2342342.23423/.part-00000-8f850371-9b03-42c4-9d22-f83bc81c9b68.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/basic-decimal-table/part=-2342342.23423/part-00000-8f850371-9b03-42c4-9d22-f83bc81c9b68.c000.snappy.parquet create mode 100644 kernel/tests/data/basic-decimal-table/part=0.00004/.part-00000-1cb60e36-6cd4-4191-a318-ae9355f877c3.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/basic-decimal-table/part=0.00004/part-00000-1cb60e36-6cd4-4191-a318-ae9355f877c3.c000.snappy.parquet create mode 100644 kernel/tests/data/basic-decimal-table/part=234.00000/.part-00000-ac109189-97e5-49af-947f-335a5e46ee5c.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/basic-decimal-table/part=234.00000/part-00000-ac109189-97e5-49af-947f-335a5e46ee5c.c000.snappy.parquet create mode 100644 kernel/tests/data/basic-decimal-table/part=2342222.23454/.part-00000-d5a0c70f-7cd3-4d32-a9c0-7171a06547c6.c000.snappy.parquet.crc create mode 100644 
kernel/tests/data/basic-decimal-table/part=2342222.23454/part-00000-d5a0c70f-7cd3-4d32-a9c0-7171a06547c6.c000.snappy.parquet create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/_delta_log/.00000000000000000000.json.crc create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/_delta_log/.00000000000000000001.json.crc create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000000.json create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000001.json create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/.part-00000-6240e68e-2304-449a-a1e6-0e24866d3508.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/.part-00001-336e3e5f-a202-4bd9-b117-28d871bbb639.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/part-00000-6240e68e-2304-449a-a1e6-0e24866d3508.c000.snappy.parquet create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/part-00001-336e3e5f-a202-4bd9-b117-28d871bbb639.c000.snappy.parquet create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2021-11-18 02%3A30%3A00.123456/.part-00000-65fcd5cb-f2f3-44f4-96ef-f43825143ba9.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2021-11-18 02%3A30%3A00.123456/part-00000-65fcd5cb-f2f3-44f4-96ef-f43825143ba9.c000.snappy.parquet create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=__HIVE_DEFAULT_PARTITION__/.part-00001-53fd3b3b-7773-459a-921c-bb64bf0bbd03.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=__HIVE_DEFAULT_PARTITION__/part-00001-53fd3b3b-7773-459a-921c-bb64bf0bbd03.c000.snappy.parquet diff --git 
a/kernel/tests/data/basic-decimal-table/_delta_log/.00000000000000000000.json.crc b/kernel/tests/data/basic-decimal-table/_delta_log/.00000000000000000000.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..9e29d16b151713e6c6d9538714cea223caefa053 GIT binary patch literal 32 ocmYc;N@ieSU}88rAFk+Ie`Vq literal 0 HcmV?d00001 diff --git a/kernel/tests/data/basic-decimal-table/part=-2342342.23423/part-00000-8f850371-9b03-42c4-9d22-f83bc81c9b68.c000.snappy.parquet b/kernel/tests/data/basic-decimal-table/part=-2342342.23423/part-00000-8f850371-9b03-42c4-9d22-f83bc81c9b68.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..63820edc4f98972a2bd1956cd51ddc2a48011fd3 GIT binary patch literal 1032 zcmb7^O^ee|6ozloT4EV0O3x*MASF1Vg&3NosihRbSqy`eQrxOYzB(4uk4aNyDE$fU z1c$+mYyAs?JHe%k`2oUgbTbUD1gD-fnMt4v5yHv2=Y8|!-Zzl`gGU+=C<6}{r?1bq zL|}1Q0FIF29hMN1W30oA70yJcz}M5aUdB;s#12OM_2JKI6}fJM1ZSVlzmzhz8?D&( z`%UZpw|8fR5pJ>qpO3i74OoE^vL17nFoC~nGxGHm!l!*=5Ge<{2qJ<=d(cg~V+Q(B zi>B`0)@*78FP_bu=ibPI0uHxXKJQF?ZMz70C@N2)8oBd{prcTPG74BDp#--ee#Stl z3{{k{qC{Ci6go18XBW%@;qY`38Cc5F9l9%Xo%qyorKGyfAYS2~F+SW6Vvcf9rau@^ ziQE$*pJC~1)*kK3y-u1>+9C5QwRDck)nM{U9eJVUTA{`NWEN6*IQ68qGz){d6H1CS7<#@tliH6L(%2eBQj|Xm ziP80(;mGo`{} z50en?j)F>(T18VeT{S9Idp`8t%An;so~yNVr{QP=8_(Kw9NW-ZrfJ->O{=bXP0wxG RHK8LC5?=&zg!J(r@E74r9033T literal 0 HcmV?d00001 diff --git a/kernel/tests/data/basic-decimal-table/part=0.00004/.part-00000-1cb60e36-6cd4-4191-a318-ae9355f877c3.c000.snappy.parquet.crc b/kernel/tests/data/basic-decimal-table/part=0.00004/.part-00000-1cb60e36-6cd4-4191-a318-ae9355f877c3.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..5646c512042d5cef1defc11d87116d1b805388c4 GIT binary patch literal 20 ccmYc;N@ieSU}7lT-?9DQo!WqgJbH18NQJ{fa3(_wK7IK0>MCPtbW2NNy9o+>dHecfov~f(|7|1EbzTyy*^c4| zc!}a78{?Dni)k78@`a3B%f2wkRDc5n5lN;a7!}MtU5uMSnHYmc&;xvPa>p-_h=3Rp9t0XHDuW24lBHcD7gqP!$aLsh`DOHPdl 
zc)E-XJmu*=Jy3;Ve(JnZ(%xo}ukf8QWqcpRoK~Poe{i4^btFTTVOd+&AswlsVHs7~ zA@fgw<&wI)1W~Rha7W;)Om7Q#Wu^ zSNzVcmGtE@R0c|t#w$NnG-WakqaaZR57)}von=y%KMI902>t2IjoOZ8b#OXgMUm@8 z=tDrLDHm*Iu-@cKj_0_B8TP`U=XIo^Oh|qcs1P#7KY`ykXTAXd literal 0 HcmV?d00001 diff --git a/kernel/tests/data/basic-decimal-table/part=234.00000/.part-00000-ac109189-97e5-49af-947f-335a5e46ee5c.c000.snappy.parquet.crc b/kernel/tests/data/basic-decimal-table/part=234.00000/.part-00000-ac109189-97e5-49af-947f-335a5e46ee5c.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..6c1db4450a8099a40b24df80627dc21b95dbd2e3 GIT binary patch literal 20 ccmYc;N@ieSU}A87blhjf57G37J0rn9@B$*Cjn0F@}jM5fOhj%Z;3nzN| zDE6KNQwJ(I+~vixHxG^7DwLtBJxFWnF6V-YLKW&LU`>S@T!HKw3#B?VQNoH6X?vMWTu-KAvUDNpz4zA6l|U1yb&<|cz|h0lyB6#% zBlMJ`yeiOd90n(8qBxHp>(0W#@b%4m(4PfSJekkL9ZT=%Z6da|9Pv@8<10H_28kZc zpX<{gaeOCn#P8fnNnR`hrLV+EwDb~1Q^u1Z^kb!ef2GWvX)2}pqmUc@z?)2+uxV>% z3#YSX7&>liErtMyuA8Rj;_XlwTWL9A_v+18BqKPeB_L_EktxG{|yd1?(q zH%#4XG~MMS^c&-z?*+cmGrf*yj9uJo*YjM<=-IY))3u$p5p)B;>$aqUOh|SRz%O@% GKY>4eHP2`O literal 0 HcmV?d00001 diff --git a/kernel/tests/data/basic-decimal-table/part=2342222.23454/.part-00000-d5a0c70f-7cd3-4d32-a9c0-7171a06547c6.c000.snappy.parquet.crc b/kernel/tests/data/basic-decimal-table/part=2342222.23454/.part-00000-d5a0c70f-7cd3-4d32-a9c0-7171a06547c6.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..808f4b946f9aa7fa603ebf4a625bf4b9620f8af5 GIT binary patch literal 20 ccmYc;N@ieSU}E4F-d=lolk|s%JZC`Fi+ilj+8mP!9ik}6UP zF2toiic}bSKs(^c!>`fxz?h-Qal&3p%R~5SHuG30MZIwZ~!f(d-@Ouz*Qh*9Q=RhZF zPlh7HGFPlU+EaVoJgU4y=HCGGDa(GFuupinL)23|q%s4yR`7k=N2SZ8%e2VPQ)lMv z8{QXy(lZ`*2tDH{uL|@lhrvmjC{Dve-I+NUzP>sK{iz>Dqv=H4u=J+hAYy&Z5gi9Q z9@+8SkM(f+OdtEP<2kV-{@@l${Cwss9VLpxxf?5*G93AV7b%?w3uWSr6Di3bh0N&r z?r7`;HCr?5IGxOcz!?PSLx8U-7i^`oSS~e|VXiZbhGzdC#9D`#4b5o$lk#Xu#Jwzp zTjQ{rrB*d`!_=*6Z7?4NUUk^^T;DU=rrUIl;Q;sAa@~Ptv~AnEJ+PgI;kSIRHK4_Y34xy literal 0 HcmV?d00001 diff --git 
a/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/.00000000000000000000.json.crc b/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/.00000000000000000000.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..2f15b9f18717a51e71629864728e911aeb97b2ac GIT binary patch literal 16 XcmYc;N@ieSU}CtC!E_>F`93KCB*O(9 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/.00000000000000000001.json.crc b/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/.00000000000000000001.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..429df32513f2be78efa61c1f3f9812a1ca59be7c GIT binary patch literal 28 kcmYc;N@ieSU}7jbw3G4Oxu1_#<|Q<({2a95bZ?+O0HuHpF8}}l literal 0 HcmV?d00001 diff --git a/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000000.json b/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000000.json new file mode 100644 index 000000000..f17ec3568 --- /dev/null +++ b/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1712333988110,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[\"tsNtzPartition\"]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0-SNAPSHOT","txnId":"fecbfd56-6849-421b-8439-070f0d694787"}} +{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tsNtz\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tsNtzPartition\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["tsNtzPartition"],"configuration":{},"createdTime":1712333987987}} 
+{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}} diff --git a/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000001.json b/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000001.json new file mode 100644 index 000000000..67749ceec --- /dev/null +++ b/kernel/tests/data/data-reader-timestamp_ntz/_delta_log/00000000000000000001.json @@ -0,0 +1,5 @@ +{"commitInfo":{"timestamp":1712333992682,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"4","numOutputRows":"9","numOutputBytes":"2940"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0-SNAPSHOT","txnId":"39f277cb-1414-419a-b634-f6a983ed9b37"}} +{"add":{"path":"tsNtzPartition=2013-07-05%2017%253A01%253A00.123456/part-00000-6240e68e-2304-449a-a1e6-0e24866d3508.c000.snappy.parquet","partitionValues":{"tsNtzPartition":"2013-07-05 17:01:00.123456"},"size":726,"modificationTime":1712333992612,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"tsNtz\":\"2021-11-18T02:30:00.123\"},\"maxValues\":{\"id\":3,\"tsNtz\":\"2021-11-18T02:30:00.123\"},\"nullCount\":{\"id\":0,\"tsNtz\":0}}"}} +{"add":{"path":"tsNtzPartition=2021-11-18%2002%253A30%253A00.123456/part-00000-65fcd5cb-f2f3-44f4-96ef-f43825143ba9.c000.snappy.parquet","partitionValues":{"tsNtzPartition":"2021-11-18 02:30:00.123456"},"size":742,"modificationTime":1712333992666,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"id\":0,\"tsNtz\":\"2013-07-05T17:01:00.123\"},\"maxValues\":{\"id\":2,\"tsNtz\":\"2021-11-18T02:30:00.123\"},\"nullCount\":{\"id\":0,\"tsNtz\":1}}"}} 
+{"add":{"path":"tsNtzPartition=__HIVE_DEFAULT_PARTITION__/part-00001-53fd3b3b-7773-459a-921c-bb64bf0bbd03.c000.snappy.parquet","partitionValues":{"tsNtzPartition":null},"size":742,"modificationTime":1712333992612,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"id\":6,\"tsNtz\":\"2013-07-05T17:01:00.123\"},\"maxValues\":{\"id\":8,\"tsNtz\":\"2021-11-18T02:30:00.123\"},\"nullCount\":{\"id\":0,\"tsNtz\":1}}"}} +{"add":{"path":"tsNtzPartition=2013-07-05%2017%253A01%253A00.123456/part-00001-336e3e5f-a202-4bd9-b117-28d871bbb639.c000.snappy.parquet","partitionValues":{"tsNtzPartition":"2013-07-05 17:01:00.123456"},"size":730,"modificationTime":1712333992659,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"id\":4,\"tsNtz\":\"2013-07-05T17:01:00.123\"},\"maxValues\":{\"id\":5,\"tsNtz\":\"2013-07-05T17:01:00.123\"},\"nullCount\":{\"id\":0,\"tsNtz\":1}}"}} diff --git a/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/.part-00000-6240e68e-2304-449a-a1e6-0e24866d3508.c000.snappy.parquet.crc b/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/.part-00000-6240e68e-2304-449a-a1e6-0e24866d3508.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..e7bef1966b6d42c8486f8fd8f0e31f9a625f8749 GIT binary patch literal 16 XcmYc;N@ieSU}Cso@|E>Virq&5CfEhY literal 0 HcmV?d00001 diff --git a/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/.part-00001-336e3e5f-a202-4bd9-b117-28d871bbb639.c000.snappy.parquet.crc b/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/.part-00001-336e3e5f-a202-4bd9-b117-28d871bbb639.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..194342167d77ac963f1ac1d16778876cd2a6ae47 GIT binary patch literal 16 XcmYc;N@ieSU}9ij5yWX8pLZDm9Q_0w literal 0 HcmV?d00001 diff --git 
a/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/part-00000-6240e68e-2304-449a-a1e6-0e24866d3508.c000.snappy.parquet b/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/part-00000-6240e68e-2304-449a-a1e6-0e24866d3508.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c8ad1d4ac0754e15b80ca728ff37277e1123ea9f GIT binary patch literal 726 zcmah{L2J}N6rN0Xmrz7dc7_CUSb_~L#33QO?RHs2JrpStidT^`*%{Z+B)d&!m3G;? zhaNo^i+6vCH?QKwKcGjizHH*A;6VtJ_rCYN@4au5lP52Hf=HJbU?b@^By*hR@qR$2sSAsC9H2qO<0#L zs*@w&A=9EE3Cjp7<7an@InU2di^&_Aa?%2D2OBsRh_oPRt0GjFJKg73_bnG%5Te+$ zOxGzlGzT4|nhgaouU5!7 zt`~op=Dpf!&u4zX{N8S~h!fEhQQsd1zIP}F(LvCclfhx&iQ$1a5nTHFzU+HLAtJtK Qx)wt4pndp$C-6uA0sMZdl>h($ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/part-00001-336e3e5f-a202-4bd9-b117-28d871bbb639.c000.snappy.parquet b/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=2013-07-05 17%3A01%3A00.123456/part-00001-336e3e5f-a202-4bd9-b117-28d871bbb639.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8fa4bfba7de9f256685baa5ed7e347f3132fb493 GIT binary patch literal 730 zcmah{O^ee|6urr$Ll&YaeUAjP2!Sabh>wJ}Go4aIcQQyBTy!Hcy!1WCV3Kx{SH&sa zxsqAT!lf_^NAQ>U1N;Tj1E`)552LcMbb+_4{M%HcWquk2=WkAn*=w0{VxZcdZt1ZAVqnlx#YA1~wVz!+aC{^KP{Vd? 
zzhgSc9_)f?mtdk(gC5gK#~fEXRSQz+ZZ@nkoQxK60E%4cq)l%$!k{xnueDM&Z7Vh% zH!hlp>fNo)ruwh=O)36EBaeW7(x}lf>1VBt#(m40=Vy#BIGSY}-|Vf-i+G+{tsWb& z0IdC6h~hk9ctqz*saQUL$5N@d;EG$<+R9V!7t$Mfh02#R<@w&LSSF(IMlV)g#?!JD z3oO^LI8*XW=2*y~oALa3@x2N0=V@*cs$PEz}Y zs*N%-An_kMurPFFtQZ*BnOGP)v+xfP*N#alF(CQlew=geJgO~st4|D)1-M|NEcxM^*JmQnzZ!f;zx(20Bb7*Pr$LN>wtpa=T`G>DpWw3AS zwpGpq@Rjp*n_`lc%@qj4f=95egY8h093n1@`lO#l*y%mInNE4~`Y0W~k})S`EN<$S z9tt333@R#pqRuv}kKb=NUN#e`VLP^0HyvcR4KVEtOmu3{1A16Dd&`}c3sPw08!j`P zj8<^~ihR^bm0oIuLA~#ga-p=GmWw^3{d&K*cCofb^~TEEx_VvwG86x)k$XTdG-}jM z`bn#zanG_Q$%yeON3$&V&ECi)4JTu((qwJsgSGV!B0Y{69?|hkDwa%6SS%G6Tyg7M zo4e|CD&4M|s$@1)uIIiCWh7F!`+V+>d7QOEf#m`gj+GqA1PkL?6!AfX0Axzf{VVEr z=Zl56BIl>S=BqH4sp9eUXspiur}AO})}vzayKz!0oYn#sG+9t<3}#^@YGTj|I?ce} q745-Jvn7Y^-KH-(JN{5`8Egl#<#&V_@GaYO0Puos{C@lRhyMY;hOpEC literal 0 HcmV?d00001 diff --git a/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=__HIVE_DEFAULT_PARTITION__/.part-00001-53fd3b3b-7773-459a-921c-bb64bf0bbd03.c000.snappy.parquet.crc b/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=__HIVE_DEFAULT_PARTITION__/.part-00001-53fd3b3b-7773-459a-921c-bb64bf0bbd03.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..964841a09be8e8c6e498cf8a5d7d714dc60ad0c6 GIT binary patch literal 16 XcmYc;N@ieSU}A`WaD2x12;T<)DTD@9 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=__HIVE_DEFAULT_PARTITION__/part-00001-53fd3b3b-7773-459a-921c-bb64bf0bbd03.c000.snappy.parquet b/kernel/tests/data/data-reader-timestamp_ntz/tsNtzPartition=__HIVE_DEFAULT_PARTITION__/part-00001-53fd3b3b-7773-459a-921c-bb64bf0bbd03.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1d1e6fb698eb89bce4dcc18d812fbbb370506a7d GIT binary patch literal 742 zcmah{O^ee|6ur%)nSdfV^gR;Dq6DXOAU+b>&U8wV5nPCf41>%ufjuDqfAqn#cJKg)&voTL!oMipyGT}tW;$#_qj78>lo=)8%Px4kIu)KstLnQ|?#lmnB$GjIK0GZNr|BAYu 
z>1^h$$i?Ze`6^0erg$l2qbd8X{e~~vdwySV8SDnK>9>XG@g3W90Puoc{C Date: Fri, 12 Jul 2024 14:25:06 -0700 Subject: [PATCH 42/54] comment fixups --- kernel/src/engine/arrow_utils.rs | 35 ++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index e8fffb228..c0b52bf2d 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -54,10 +54,10 @@ fn check_cast_compat( /// Ensure a kernel data type matches an arrow data type. This only ensures that the actual "type" /// is the same, but does so recursively into structs, and ensures lists and maps have the correct -/// associated types as well. This returns an `Ok(())` if the types are compatible, or an error if -/// the types do not match. If there is a `struct` type included, we only ensure that the named -/// fields that the kernel is asking for exist, and that for those fields the types -/// match. Un-selected fields are ignored. +/// associated types as well. This returns an `Ok(DataTypeCompat)` if the types are compatible, and +/// will indicate what kind of compatibility they have, or an error if the types do not match. If +/// there is a `struct` type included, we only ensure that the named fields that the kernel is +/// asking for exist, and that for those fields the types match. Un-selected fields are ignored. pub(crate) fn ensure_data_types( kernel_type: &DataType, arrow_type: &ArrowDataType, @@ -175,9 +175,12 @@ pub(crate) fn ensure_data_types( * 1. Loop over each field in parquet_schema, keeping track of how many physical fields (i.e. actual * stored columns) we have seen so far * 2. If a requested field matches the physical field, push the index of the field onto the mask. -* 3. Also push a ReorderIndex element that indicates where this item should be in the final output. -* 4. 
If a nested element (struct/map/list) is encountered, recurse into it, pushing indices onto the -* same vector, but producing a new reorder level, which is added to the parent with a `Child` kind + +* 3. Also push a ReorderIndex element that indicates where this item should be in the final output, +* and if it needs any transformation (i.e. casting, create null column) +* 4. If a nested element (struct/map/list) is encountered, recurse into it, pushing indices onto +* the same vector, but producing a new reorder level, which is added to the parent with a `Child` +* transform * * `generate_mask` is simple, and just calls `ProjectionMask::leaves` in the parquet crate with the * indices computed by `get_requested_indices` @@ -187,14 +190,16 @@ pub(crate) fn ensure_data_types( * 2. If ordered we're done, return, otherwise: * 3. Create a Vec[None, ..., None] of placeholders that will hold the correctly ordered columns * 4. Deconstruct the existing struct array and then loop over the `ReorderIndex` list -* 5. If the `kind` is Index: put the column at the correct location -* 6. If the `kind` is Missing: put a column of `null` at the correct location -* 7. If the `kind` is Child([child_order]) and the data is a `StructArray` o, recursively call -* `reorder_struct_array` on the column with `child_order` and put the resulting, now correctly -* ordered array, at the correct location -* 8. If the `kind` is Child and the data is a `List`, get the inner struct array out of -* the list, reorder it recursively as above, rebuild the list, and the put the column at the -* correct location +* 5. If the `transform` is Index: put the column at the correct location +* 6. If the `transform` is Cast: cast the column to the specified type, and put it at the correct +* location +* 7. If the `transform` is Missing: put a column of `null` at the correct location +* 8. 
If the `transform` is Child([child_order]) and the data is a `StructArray` o, recursively call +* `reorder_struct_array` on the column with `child_order` and put the resulting, now correctly +* ordered array, at the correct location +* 9. If the `transform` is Child and the data is a `List`, get the inner struct array +* out of the list, reorder it recursively as above, rebuild the list, and the put the column +* at the correct location * * Example: * The parquet crate treats columns being actually "flat", so a struct column is purely a schema From f5ca39e5aa9085272effe59e9b5af6e7ed746ad4 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 12 Jul 2024 14:27:48 -0700 Subject: [PATCH 43/54] rename new_index -> new_none --- kernel/src/engine/arrow_utils.rs | 82 ++++++++++++++++---------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index c0b52bf2d..89850f580 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -278,7 +278,7 @@ impl ReorderIndex { } } - fn new_index(index: usize) -> Self { + fn new_none(index: usize) -> Self { ReorderIndex { index, transform: ReorderIndexTransform::None, @@ -441,7 +441,7 @@ fn get_indices( // note that we found this field found_fields.insert(requested_field.name()); // push the child reorder on, currently no reordering for maps - reorder_indices.push(ReorderIndex::new_index(index)); + reorder_indices.push(ReorderIndex::new_none(index)); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -455,7 +455,7 @@ fn get_indices( { match ensure_data_types(&requested_field.data_type, field.data_type())? 
{ DataTypeCompat::Identical => - reorder_indices.push(ReorderIndex::new_index(index)), + reorder_indices.push(ReorderIndex::new_none(index)), DataTypeCompat::NeedsCast(target) => reorder_indices.push(ReorderIndex::new_cast(index, target)), DataTypeCompat::Nested => return @@ -741,9 +741,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ - ReorderIndex::new_index(0), - ReorderIndex::new_index(1), - ReorderIndex::new_index(2), + ReorderIndex::new_none(0), + ReorderIndex::new_none(1), + ReorderIndex::new_none(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -797,7 +797,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; - let expect_reorder = vec![ReorderIndex::new_index(0)]; + let expect_reorder = vec![ReorderIndex::new_none(0)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -818,9 +818,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ - ReorderIndex::new_index(2), - ReorderIndex::new_index(0), - ReorderIndex::new_index(1), + ReorderIndex::new_none(2), + ReorderIndex::new_none(0), + ReorderIndex::new_none(1), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -841,8 +841,8 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; let expect_reorder = vec![ - ReorderIndex::new_index(0), - ReorderIndex::new_index(2), + ReorderIndex::new_none(0), + ReorderIndex::new_none(2), ReorderIndex::new_missing(1, Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true))), ]; assert_eq!(mask_indices, expect_mask); @@ -868,12 +868,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let 
expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - ReorderIndex::new_index(0), + ReorderIndex::new_none(0), ReorderIndex::new_child( 1, - vec![ReorderIndex::new_index(0), ReorderIndex::new_index(1)], + vec![ReorderIndex::new_none(0), ReorderIndex::new_none(1)], ), - ReorderIndex::new_index(2), + ReorderIndex::new_none(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -898,12 +898,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - ReorderIndex::new_index(2), + ReorderIndex::new_none(2), ReorderIndex::new_child( 0, - vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], ), - ReorderIndex::new_index(1), + ReorderIndex::new_none(1), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -925,9 +925,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - ReorderIndex::new_index(0), - ReorderIndex::new_child(1, vec![ReorderIndex::new_index(0)]), - ReorderIndex::new_index(2), + ReorderIndex::new_none(0), + ReorderIndex::new_child(1, vec![ReorderIndex::new_none(0)]), + ReorderIndex::new_none(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -957,9 +957,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ - ReorderIndex::new_index(0), - ReorderIndex::new_index(1), - ReorderIndex::new_index(2), + ReorderIndex::new_none(0), + ReorderIndex::new_none(1), + ReorderIndex::new_none(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1006,12 +1006,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 
3]; let expect_reorder = vec![ - ReorderIndex::new_index(0), + ReorderIndex::new_none(0), ReorderIndex::new_child( 1, - vec![ReorderIndex::new_index(0), ReorderIndex::new_index(1)], + vec![ReorderIndex::new_none(0), ReorderIndex::new_none(1)], ), - ReorderIndex::new_index(2), + ReorderIndex::new_none(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1055,9 +1055,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - ReorderIndex::new_index(0), - ReorderIndex::new_child(1, vec![ReorderIndex::new_index(0)]), - ReorderIndex::new_index(2), + ReorderIndex::new_none(0), + ReorderIndex::new_child(1, vec![ReorderIndex::new_none(0)]), + ReorderIndex::new_none(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1105,12 +1105,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 2, 3, 4]; let expect_reorder = vec![ - ReorderIndex::new_index(0), + ReorderIndex::new_none(0), ReorderIndex::new_child( 1, - vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], ), - ReorderIndex::new_index(2), + ReorderIndex::new_none(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1160,12 +1160,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![2, 3, 4, 5]; let expect_reorder = vec![ - ReorderIndex::new_index(2), + ReorderIndex::new_none(2), ReorderIndex::new_child( 1, - vec![ReorderIndex::new_index(0), ReorderIndex::new_index(1)], + vec![ReorderIndex::new_none(0), ReorderIndex::new_none(1)], ), - ReorderIndex::new_index(0), + ReorderIndex::new_none(0), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1189,7 +1189,7 @@ mod tests { #[test] fn 
simple_reorder_struct() { let arry = make_struct_array(); - let reorder = vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)]; + let reorder = vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)]; let ordered = reorder_struct_array(arry, &reorder).unwrap(); assert_eq!(ordered.column_names(), vec!["c", "b"]); } @@ -1224,13 +1224,13 @@ mod tests { let reorder = vec![ ReorderIndex::new_child( 1, - vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], ), ReorderIndex::new_child( 0, vec![ - ReorderIndex::new_index(0), - ReorderIndex::new_index(1), + ReorderIndex::new_none(0), + ReorderIndex::new_none(1), ReorderIndex::new_missing( 2, Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), @@ -1283,7 +1283,7 @@ mod tests { let struct_array = StructArray::from(vec![(list_dt, list as ArrowArrayRef)]); let reorder = vec![ReorderIndex::new_child( 0, - vec![ReorderIndex::new_index(1), ReorderIndex::new_index(0)], + vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], )]; let ordered = reorder_struct_array(struct_array, &reorder).unwrap(); let ordered_list_col = ordered.column(0).as_list::(); From b5808574fdced29821a156588fa1f2220ce4a979 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Fri, 12 Jul 2024 14:33:10 -0700 Subject: [PATCH 44/54] fix tranform checks since we now inverted --- kernel/src/engine/arrow_utils.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 89850f580..d79f30f8c 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -296,10 +296,12 @@ impl ReorderIndex { /// [`is_ordered`] to understand why this is needed. 
fn needs_transform(&self) -> bool { match self.transform { - ReorderIndexTransform::Cast(_) => true, - ReorderIndexTransform::Child(ref children) => is_ordered(children), - ReorderIndexTransform::None => true, - ReorderIndexTransform::Missing(_) => false, + // if we're casting or inserting null, we need to transform + ReorderIndexTransform::Cast(_) | ReorderIndexTransform::Missing(_) => true, + // if our children are not ordered somehow, we need a transform + ReorderIndexTransform::Child(ref children) => !is_ordered(children), + // no transform needed + ReorderIndexTransform::None => false, } } } @@ -533,19 +535,20 @@ pub(crate) fn generate_mask( /// Check if an ordering is already ordered. We check if the indices are in ascending order. That's /// enough to ensure we don't need to do any transformation on the data read from parquet _iff_ -/// there are no `null` columns to insert. If we _do_ need to insert a null column then we need to -/// transform the data. Therefore we also call [`contains_missing`] to ensure both the ascending -/// nature of the indices AND that no `Missing` variants exist, and only if both are true do we -/// consider an ordering "ordered". +/// there are no `null` columns to insert and no casts are needed. If we _do_ need to insert a null +/// column or cast something then we need to transform the data. Therefore we also call +/// [`needs_transform`] to ensure both the ascending nature of the indices AND that no transform is +/// required. fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { if requested_ordering.is_empty() { return true; } - // we have >=1 element. check that the first element is ordered - if !requested_ordering[0].needs_transform() { + // we have >=1 element. check that the first element doesn't need a transform + if requested_ordering[0].needs_transform() { return false; } - // now check that all elements are ordered wrt. each other, and are internally ordered + // now check that all elements are ordered wrt. 
each other, and internally don't need + // transformation requested_ordering .windows(2) .all(|ri| (ri[0].index < ri[1].index) && !ri[1].needs_transform()) From 8ad1933b14070acb38420db84a054a897147016d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 13:31:22 -0700 Subject: [PATCH 45/54] Apply suggestions from code review Co-authored-by: Ryan Johnson --- kernel/src/engine/arrow_utils.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index d79f30f8c..a21297608 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -171,7 +171,7 @@ pub(crate) fn ensure_data_types( * The algorithm has three parts, handled by `get_requested_indices`, `generate_mask` and * `reorder_struct_array` respectively. -* `get_requested_indices` generates indices to select and reordering information: +* `get_requested_indices` generates indices to select, along with reordering information: * 1. Loop over each field in parquet_schema, keeping track of how many physical fields (i.e. actual * stored columns) we have seen so far * 2. If a requested field matches the physical field, push the index of the field onto the mask. @@ -187,14 +187,14 @@ pub(crate) fn ensure_data_types( * * `reorder_struct_array` handles reordering: * 1. First check if we're already in order (see doc comment for `is_ordered`) -* 2. If ordered we're done, return, otherwise: +* 2. If ordered we're done (return); otherwise: * 3. Create a Vec[None, ..., None] of placeholders that will hold the correctly ordered columns * 4. Deconstruct the existing struct array and then loop over the `ReorderIndex` list * 5. If the `transform` is Index: put the column at the correct location * 6. If the `transform` is Cast: cast the column to the specified type, and put it at the correct * location * 7. If the `transform` is Missing: put a column of `null` at the correct location -* 8. 
If the `transform` is Child([child_order]) and the data is a `StructArray` o, recursively call +* 8. If the `transform` is Child([child_order]) and the data is a `StructArray`, recursively call * `reorder_struct_array` on the column with `child_order` and put the resulting, now correctly * ordered array, at the correct location * 9. If the `transform` is Child and the data is a `List`, get the inner struct array @@ -202,8 +202,8 @@ pub(crate) fn ensure_data_types( * at the correct location * * Example: -* The parquet crate treats columns being actually "flat", so a struct column is purely a schema -* level thing and doesn't "count" wrt. column indices. +* The parquet crate `ProjectionMask::leaves` method only considers leaf columns -- a "flat" schema -- +* so a struct column is purely a schema level thing and doesn't "count" wrt. column indices. * * So if we have the following file physical schema: * From fa61467d1114395739cf07501b08cda3e4eaa202 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 13:44:21 -0700 Subject: [PATCH 46/54] addressing lots of comments --- kernel/src/engine/arrow_utils.rs | 96 ++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 43 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index a21297608..1e117ce28 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -37,10 +37,11 @@ pub(crate) enum DataTypeCompat { // Check if two types can be cast fn check_cast_compat( - source_type: &ArrowDataType, target_type: ArrowDataType, + source_type: &ArrowDataType, ) -> DeltaResult { match (source_type, &target_type) { + (source_type, target_type) if source_type == target_type => Ok(DataTypeCompat::Identical), (&ArrowDataType::Timestamp(_, _), &ArrowDataType::Timestamp(_, _)) => { // timestamps are able to be cast between each other Ok(DataTypeCompat::NeedsCast(target_type)) @@ -64,12 +65,7 @@ pub(crate) fn ensure_data_types( ) -> DeltaResult { 
match (kernel_type, arrow_type) { (DataType::Primitive(_), _) if arrow_type.is_primitive() => { - let converted_type: ArrowDataType = kernel_type.try_into()?; - if &converted_type == arrow_type { - Ok(DataTypeCompat::Identical) - } else { - check_cast_compat(arrow_type, converted_type) - } + check_cast_compat(kernel_type.try_into()?, arrow_type) } (DataType::Primitive(PrimitiveType::Boolean), ArrowDataType::Boolean) | (DataType::Primitive(PrimitiveType::String), ArrowDataType::Utf8) @@ -91,21 +87,24 @@ pub(crate) fn ensure_data_types( } (DataType::Map(kernel_map_type), ArrowDataType::Map(arrow_map_type, _)) => { if let ArrowDataType::Struct(fields) = arrow_map_type.data_type() { - let mut fiter = fields.iter(); - if let Some(key_type) = fiter.next() { + let mut fields = fields.iter(); + if let Some(key_type) = fields.next() { ensure_data_types(&kernel_map_type.key_type, key_type.data_type())?; } else { return Err(make_arrow_error( "Arrow map struct didn't have a key type".to_string(), )); } - if let Some(value_type) = fiter.next() { + if let Some(value_type) = fields.next() { ensure_data_types(&kernel_map_type.value_type, value_type.data_type())?; } else { return Err(make_arrow_error( "Arrow map struct didn't have a value type".to_string(), )); } + if fields.next().is_some() { + return Err(Error::generic("map fields had more than 2 members")); + } Ok(DataTypeCompat::Nested) } else { Err(make_arrow_error( @@ -117,7 +116,7 @@ pub(crate) fn ensure_data_types( // build a list of kernel fields that matches the order of the arrow fields let mapped_fields = arrow_fields .iter() - .flat_map(|f| kernel_fields.fields.get(f.name())); + .filter_map(|f| kernel_fields.fields.get(f.name())); // keep track of how many fields we matched up let mut found_fields = 0; @@ -129,11 +128,19 @@ pub(crate) fn ensure_data_types( // require that we found the number of fields that we requested. 
require!(kernel_fields.fields.len() == found_fields, { - let kernel_field_names = kernel_fields.fields.keys().join(", "); - let arrow_field_names = arrow_fields.iter().map(|f| f.name()).join(", "); + let arrow_field_map: HashSet<&String> = HashSet::from_iter( + arrow_fields.iter().map(|f| f.name()) + ); + let missing_field_names = kernel_fields.fields.keys().filter_map(|kernel_field| { + if arrow_field_map.contains(kernel_field) { + None + } else { + Some(kernel_field) + } + }).take(5).join(", "); make_arrow_error(format!( - "Missing Struct fields. Requested: {}, found: {}", - kernel_field_names, arrow_field_names, + "Missing Struct fields {} (Up to five missing fields shown)", + missing_field_names )) }); Ok(DataTypeCompat::Nested) @@ -153,17 +160,20 @@ pub(crate) fn ensure_data_types( * At a high level there are three schemas/concepts to worry about: * - The parquet file's physical schema (= the columns that are actually available), called * "parquet_schema" below -* - The requested logical schema from the engine (= the columns we actually want, a superset of -* the read schema), called "requested_schema" below +* - The requested logical schema from the engine (= the columns we actually want), called +* "requested_schema" below +* - The Read schema (an intersection of 1. and 2., in logical schema order). This is never +* materialized, but is useful to be able to refer to here * - A `ProjectionMask` that goes to the parquet reader which specifies which subset of columns from * the file schema to actually read. (See "Example" below) * -* In other words, the ProjectionMask is the intersection of file schema and logical schema, and then -* mapped to indices in the parquet file. Columns unique to the file schema need to be masked out (= -* ignored), while columns unique to the logical schema need to be backfilled with nulls.
+* In other words, the ProjectionMask is the intersection of the parquet schema and logical schema, +* and then mapped to indices in the parquet file. Columns unique to the file schema need to be +* masked out (= ignored), while columns unique to the logical schema need to be backfilled with +* nulls. * -* We also have to worry about field ordering differences between read schema and logical schema. We -* represent any reordering needed as a tree. Each level of the tree is a vec of +* We also have to worry about field ordering differences between the read schema and logical +* schema. We represent any reordering needed as a tree. Each level of the tree is a vec of * `ReorderIndex`s. Each element's index represents a column that will be in the read parquet data * (as an arrow StructArray) at that level and index. The `ReorderIndex::index` field of the element * is the position that the column should appear in the final output. @@ -172,14 +182,14 @@ pub(crate) fn ensure_data_types( * `reorder_struct_array` respectively. * `get_requested_indices` generates indices to select, along with reordering information: -* 1. Loop over each field in parquet_schema, keeping track of how many physical fields (i.e. actual -* stored columns) we have seen so far +* 1. Loop over each field in parquet_schema, keeping track of how many physical fields (i.e. leaf +* columns) we have seen so far * 2. If a requested field matches the physical field, push the index of the field onto the mask. * 3. Also push a ReorderIndex element that indicates where this item should be in the final output, * and if it needs any transformation (i.e. casting, create null column) * 4. 
If a nested element (struct/map/list) is encountered, recurse into it, pushing indices onto -* the same vector, but producing a new reorder level, which is added to the parent with a `Child` +* the same vector, but producing a new reorder level, which is added to the parent with a `Nested` * transform * * `generate_mask` is simple, and just calls `ProjectionMask::leaves` in the parquet crate with the @@ -190,16 +200,16 @@ pub(crate) fn ensure_data_types( * 2. If ordered we're done (return); otherwise: * 3. Create a Vec[None, ..., None] of placeholders that will hold the correctly ordered columns * 4. Deconstruct the existing struct array and then loop over the `ReorderIndex` list -* 5. If the `transform` is Index: put the column at the correct location -* 6. If the `transform` is Cast: cast the column to the specified type, and put it at the correct -* location -* 7. If the `transform` is Missing: put a column of `null` at the correct location -* 8. If the `transform` is Child([child_order]) and the data is a `StructArray`, recursively call -* `reorder_struct_array` on the column with `child_order` and put the resulting, now correctly -* ordered array, at the correct location -* 9. If the `transform` is Child and the data is a `List`, get the inner struct array -* out of the list, reorder it recursively as above, rebuild the list, and the put the column -* at the correct location +* 5. Use the `ReorderIndex::index` value to put the column at the correct location +* 6. 
Additionally, if `ReorderIndex::transform` is not `Identity`, then if it is: +* - `Cast`: cast the column to the specified type +* - `Missing`: put a column of `null` at the correct location +* - `Nested([child_order])` and the data is a `StructArray`: recursively call +* `reorder_struct_array` on the column with `child_order` to correctly order the child +* array +* - `Nested` and the data is a `List<StructArray>`: get the inner struct array out of the list, +* reorder it recursively as above, rebuild the list, and then put the column at the correct +* location * * Example: * The parquet crate `ProjectionMask::leaves` method only considers leaf columns -- a "flat" schema -- * so a struct column is purely a schema level thing and doesn't "count" wrt. column indices. * * So if we have the following file physical schema: * @@ -231,10 +241,10 @@ pub(crate) fn ensure_data_types( * The reorder tree is: * [ * // col a is at position 0 in the struct array, and should be moved to position 1 -* { index: 1, Child([{ index: 0 }]) }, +* { index: 1, Nested([{ index: 0 }]) }, * // col b is at position 1 in the struct array, and should be moved to position 0 * // also, the inner struct array needs to be reordered to swap 'f' and 'e' -* { index: 0, Child([{ index: 1 }, {index: 0}]) }, +* { index: 0, Nested([{ index: 1 }, {index: 0}]) }, * // col c is at position 2 in the struct array, and should stay there * { index: 2 } * ] @@ -256,7 +266,7 @@ pub(crate) enum ReorderIndexTransform { /// For a non-nested type, indicates that we need to cast to the contained type Cast(ArrowDataType), /// Used for struct/list/map.
Potentially transform child fields using contained reordering - Child(Vec), + Nested(Vec), /// No work needed to transform this data None, /// Data is missing, fill in with a null column @@ -274,7 +284,7 @@ impl ReorderIndex { fn new_child(index: usize, children: Vec) -> Self { ReorderIndex { index, - transform: ReorderIndexTransform::Child(children), + transform: ReorderIndexTransform::Nested(children), } } @@ -299,7 +309,7 @@ impl ReorderIndex { // if we're casting or inserting null, we need to transform ReorderIndexTransform::Cast(_) | ReorderIndexTransform::Missing(_) => true, // if our children are not ordered somehow, we need a transform - ReorderIndexTransform::Child(ref children) => !is_ordered(children), + ReorderIndexTransform::Nested(ref children) => !is_ordered(children), // no transform needed ReorderIndexTransform::None => false, } @@ -590,7 +600,7 @@ pub(crate) fn reorder_struct_array( ); final_fields_cols[reorder_index.index] = Some((new_field, new_col)); } - ReorderIndexTransform::Child(children) => { + ReorderIndexTransform::Nested(children) => { match input_cols[parquet_position].data_type() { ArrowDataType::Struct(_) => { let struct_array = input_cols[parquet_position].as_struct().clone(); @@ -623,7 +633,7 @@ pub(crate) fn reorder_struct_array( // TODO: MAP _ => { return Err(Error::generic( - "Child reorder can only apply to struct/list/map. This is a kernel bug, please report" + "Nested reorder can only apply to struct/list/map. This is a kernel bug, please report" )); } } @@ -685,7 +695,7 @@ fn reorder_list( Ok(Some((new_field, list))) } else { Err(Error::generic( - "Child reorder of list should have had struct child. This is a kernel bug, please report" + "Nested reorder of list should have had struct child. 
This is a kernel bug, please report" )) } } From f6ff0f3beee89a5202b4bdbb49b0aadb0629320a Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 13:45:37 -0700 Subject: [PATCH 47/54] None -> Identity --- kernel/src/engine/arrow_utils.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 1e117ce28..9c82542ad 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -268,7 +268,7 @@ pub(crate) enum ReorderIndexTransform { /// Used for struct/list/map. Potentially transform child fields using contained reordering Nested(Vec), /// No work needed to transform this data - None, + Identity, /// Data is missing, fill in with a null column Missing(ArrowFieldRef), } @@ -291,7 +291,7 @@ impl ReorderIndex { fn new_none(index: usize) -> Self { ReorderIndex { index, - transform: ReorderIndexTransform::None, + transform: ReorderIndexTransform::Identity, } } @@ -311,7 +311,7 @@ impl ReorderIndex { // if our children are not ordered somehow, we need a transform ReorderIndexTransform::Nested(ref children) => !is_ordered(children), // no transform needed - ReorderIndexTransform::None => false, + ReorderIndexTransform::Identity => false, } } } @@ -638,7 +638,7 @@ pub(crate) fn reorder_struct_array( } } } - ReorderIndexTransform::None => { + ReorderIndexTransform::Identity => { final_fields_cols[reorder_index.index] = Some(( input_fields[parquet_position].clone(), // cheap Arc clone input_cols[parquet_position].clone(), // cheap Arc clone From 68bec02f32a6ccee668fc9aff49a235fa76dbdb1 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 13:49:18 -0700 Subject: [PATCH 48/54] cleaner new methods for ReorderIndex --- kernel/src/engine/arrow_utils.rs | 168 +++++++++++++++---------------- 1 file changed, 82 insertions(+), 86 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 9c82542ad..87e8c0d3d 
100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -128,16 +128,20 @@ pub(crate) fn ensure_data_types( // require that we found the number of fields that we requested. require!(kernel_fields.fields.len() == found_fields, { - let arrow_field_map: HashSet<&String> = HashSet::from_iter( - arrow_fields.iter().map(|f| f.name()) - ); - let missing_field_names = kernel_fields.fields.keys().filter_map(|kernel_field| { - if arrow_field_map.contains(kernel_field) { - None - } else { - Some(kernel_field) - } - }).take(5).join(", "); + let arrow_field_map: HashSet<&String> = + HashSet::from_iter(arrow_fields.iter().map(|f| f.name())); + let missing_field_names = kernel_fields + .fields + .keys() + .filter_map(|kernel_field| { + if arrow_field_map.contains(kernel_field) { + None + } else { + Some(kernel_field) + } + }) + .take(5) + .join(", "); make_arrow_error(format!( "Missing Struct fields {} (Up to five missing fields shown)", missing_field_names @@ -212,7 +216,7 @@ pub(crate) fn ensure_data_types( * location * * Example: -* The parquet crate `ProjectionMask::leaves` method only considers leaf columns -- a "flat" schema -- +* The parquet crate `ProjectionMask::leaves` method only considers leaf columns -- a "flat" schema -- * so a struct column is purely a schema level thing and doesn't "count" wrt. column indices. 
* * So if we have the following file physical schema: @@ -274,32 +278,24 @@ pub(crate) enum ReorderIndexTransform { } impl ReorderIndex { - fn new_cast(index: usize, target: ArrowDataType) -> Self { - ReorderIndex { - index, - transform: ReorderIndexTransform::Cast(target), - } + fn new(index: usize, transform: ReorderIndexTransform) -> Self { + ReorderIndex { index, transform } } - fn new_child(index: usize, children: Vec) -> Self { - ReorderIndex { - index, - transform: ReorderIndexTransform::Nested(children), - } + fn cast(index: usize, target: ArrowDataType) -> Self { + ReorderIndex::new(index, ReorderIndexTransform::Cast(target)) } - fn new_none(index: usize) -> Self { - ReorderIndex { - index, - transform: ReorderIndexTransform::Identity, - } + fn nested(index: usize, children: Vec) -> Self { + ReorderIndex::new(index, ReorderIndexTransform::Nested(children)) } - fn new_missing(index: usize, field: ArrowFieldRef) -> Self { - ReorderIndex { - index, - transform: ReorderIndexTransform::Missing(field), - } + fn identity(index: usize) -> Self { + ReorderIndex::new(index, ReorderIndexTransform::Identity) + } + + fn missing(index: usize, field: ArrowFieldRef) -> Self { + ReorderIndex::new(index, ReorderIndexTransform::Missing(field)) } /// Check if this reordering requires a transformation anywhere. 
See comment below on @@ -371,7 +367,7 @@ fn get_indices( // note that we found this field found_fields.insert(requested_field.name()); // push the child reorder on - reorder_indices.push(ReorderIndex::new_child(index, children)); + reorder_indices.push(ReorderIndex::nested(index, children)); } else { return Err(Error::unexpected_column_type(field.name())); } @@ -453,7 +449,7 @@ fn get_indices( // note that we found this field found_fields.insert(requested_field.name()); // push the child reorder on, currently no reordering for maps - reorder_indices.push(ReorderIndex::new_none(index)); + reorder_indices.push(ReorderIndex::identity(index)); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -467,9 +463,9 @@ fn get_indices( { match ensure_data_types(&requested_field.data_type, field.data_type())? { DataTypeCompat::Identical => - reorder_indices.push(ReorderIndex::new_none(index)), + reorder_indices.push(ReorderIndex::identity(index)), DataTypeCompat::NeedsCast(target) => - reorder_indices.push(ReorderIndex::new_cast(index, target)), + reorder_indices.push(ReorderIndex::cast(index, target)), DataTypeCompat::Nested => return Err(Error::generic( "Comparing nested types in get_indices. 
This is a kernel bug, please report" @@ -487,7 +483,7 @@ fn get_indices( if !found_fields.contains(field.name()) { if field.nullable { debug!("Inserting missing and nullable field: {}", field.name()); - reorder_indices.push(ReorderIndex::new_missing( + reorder_indices.push(ReorderIndex::missing( requested_position, Arc::new(field.try_into()?), )); @@ -754,9 +750,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_none(1), - ReorderIndex::new_none(2), + ReorderIndex::identity(0), + ReorderIndex::identity(1), + ReorderIndex::identity(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -810,7 +806,7 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; - let expect_reorder = vec![ReorderIndex::new_none(0)]; + let expect_reorder = vec![ReorderIndex::identity(0)]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); } @@ -831,9 +827,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ - ReorderIndex::new_none(2), - ReorderIndex::new_none(0), - ReorderIndex::new_none(1), + ReorderIndex::identity(2), + ReorderIndex::identity(0), + ReorderIndex::identity(1), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -854,9 +850,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_none(2), - ReorderIndex::new_missing(1, Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true))), + ReorderIndex::identity(0), + ReorderIndex::identity(2), + ReorderIndex::missing(1, Arc::new(ArrowField::new("s", ArrowDataType::Utf8, 
true))), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -881,12 +877,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_child( + ReorderIndex::identity(0), + ReorderIndex::nested( 1, - vec![ReorderIndex::new_none(0), ReorderIndex::new_none(1)], + vec![ReorderIndex::identity(0), ReorderIndex::identity(1)], ), - ReorderIndex::new_none(2), + ReorderIndex::identity(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -911,12 +907,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - ReorderIndex::new_none(2), - ReorderIndex::new_child( + ReorderIndex::identity(2), + ReorderIndex::nested( 0, - vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], + vec![ReorderIndex::identity(1), ReorderIndex::identity(0)], ), - ReorderIndex::new_none(1), + ReorderIndex::identity(1), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -938,9 +934,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_child(1, vec![ReorderIndex::new_none(0)]), - ReorderIndex::new_none(2), + ReorderIndex::identity(0), + ReorderIndex::nested(1, vec![ReorderIndex::identity(0)]), + ReorderIndex::identity(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -970,9 +966,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_none(1), - ReorderIndex::new_none(2), + ReorderIndex::identity(0), + ReorderIndex::identity(1), + 
ReorderIndex::identity(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1019,12 +1015,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 2, 3]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_child( + ReorderIndex::identity(0), + ReorderIndex::nested( 1, - vec![ReorderIndex::new_none(0), ReorderIndex::new_none(1)], + vec![ReorderIndex::identity(0), ReorderIndex::identity(1)], ), - ReorderIndex::new_none(2), + ReorderIndex::identity(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1068,9 +1064,9 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1, 3]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_child(1, vec![ReorderIndex::new_none(0)]), - ReorderIndex::new_none(2), + ReorderIndex::identity(0), + ReorderIndex::nested(1, vec![ReorderIndex::identity(0)]), + ReorderIndex::identity(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1118,12 +1114,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 2, 3, 4]; let expect_reorder = vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_child( + ReorderIndex::identity(0), + ReorderIndex::nested( 1, - vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], + vec![ReorderIndex::identity(1), ReorderIndex::identity(0)], ), - ReorderIndex::new_none(2), + ReorderIndex::identity(2), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1173,12 +1169,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![2, 3, 4, 5]; let expect_reorder = vec![ - ReorderIndex::new_none(2), - ReorderIndex::new_child( + ReorderIndex::identity(2), + ReorderIndex::nested( 1, - 
vec![ReorderIndex::new_none(0), ReorderIndex::new_none(1)], + vec![ReorderIndex::identity(0), ReorderIndex::identity(1)], ), - ReorderIndex::new_none(0), + ReorderIndex::identity(0), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1202,7 +1198,7 @@ mod tests { #[test] fn simple_reorder_struct() { let arry = make_struct_array(); - let reorder = vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)]; + let reorder = vec![ReorderIndex::identity(1), ReorderIndex::identity(0)]; let ordered = reorder_struct_array(arry, &reorder).unwrap(); assert_eq!(ordered.column_names(), vec!["c", "b"]); } @@ -1235,16 +1231,16 @@ mod tests { ), ]); let reorder = vec![ - ReorderIndex::new_child( + ReorderIndex::nested( 1, - vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], + vec![ReorderIndex::identity(1), ReorderIndex::identity(0)], ), - ReorderIndex::new_child( + ReorderIndex::nested( 0, vec![ - ReorderIndex::new_none(0), - ReorderIndex::new_none(1), - ReorderIndex::new_missing( + ReorderIndex::identity(0), + ReorderIndex::identity(1), + ReorderIndex::missing( 2, Arc::new(ArrowField::new("s", ArrowDataType::Utf8, true)), ), @@ -1294,9 +1290,9 @@ mod tests { false, )); let struct_array = StructArray::from(vec![(list_dt, list as ArrowArrayRef)]); - let reorder = vec![ReorderIndex::new_child( + let reorder = vec![ReorderIndex::nested( 0, - vec![ReorderIndex::new_none(1), ReorderIndex::new_none(0)], + vec![ReorderIndex::identity(1), ReorderIndex::identity(0)], )]; let ordered = reorder_struct_array(struct_array, &reorder).unwrap(); let ordered_list_col = ordered.column(0).as_list::(); @@ -1323,8 +1319,8 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask: Vec = vec![]; let expect_reorder = vec![ - ReorderIndex::new_missing(0, nots_field.with_name("s").into()), - ReorderIndex::new_missing(1, noti2_field.with_name("i2").into()), + ReorderIndex::missing(0, 
nots_field.with_name("s").into()), + ReorderIndex::missing(1, noti2_field.with_name("i2").into()), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); From 62fb031aca8f4c3e39e3d37d49b43580d3476034 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 14:31:52 -0700 Subject: [PATCH 49/54] renaming to ordering_needs_transform --- kernel/src/engine/arrow_utils.rs | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 87e8c0d3d..001790cef 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -199,9 +199,10 @@ pub(crate) fn ensure_data_types( * `generate_mask` is simple, and just calls `ProjectionMask::leaves` in the parquet crate with the * indices computed by `get_requested_indices` * -* `reorder_struct_array` handles reordering: -* 1. First check if we're already in order (see doc comment for `is_ordered`) -* 2. If ordered we're done (return); otherwise: +* `reorder_struct_array` handles reordering and data transforms: +* 1. First check if we need to do any transformations (see doc comment for +* `ordering_needs_transform`) +* 2. If nothing is required we're done (return); otherwise: * 3. Create a Vec[None, ..., None] of placeholders that will hold the correctly ordered columns * 4. Deconstruct the existing struct array and then loop over the `ReorderIndex` list * 5. Use the `ReorderIndex::index` value to put the column at the correct location @@ -299,13 +300,13 @@ impl ReorderIndex { } /// Check if this reordering requires a transformation anywhere. See comment below on - /// [`is_ordered`] to understand why this is needed. + /// [`ordering_needs_transform`] to understand why this is needed. 
fn needs_transform(&self) -> bool { match self.transform { // if we're casting or inserting null, we need to transform ReorderIndexTransform::Cast(_) | ReorderIndexTransform::Missing(_) => true, - // if our children are not ordered somehow, we need a transform - ReorderIndexTransform::Nested(ref children) => !is_ordered(children), + // if our nested ordering needs a transform, we need a transform + ReorderIndexTransform::Nested(ref children) => ordering_needs_transform(children), // no transform needed ReorderIndexTransform::Identity => false, } @@ -539,25 +540,24 @@ pub(crate) fn generate_mask( )) } -/// Check if an ordering is already ordered. We check if the indices are in ascending order. That's -/// enough to ensure we don't need to do any transformation on the data read from parquet _iff_ -/// there are no `null` columns to insert and no casts are needed. If we _do_ need to insert a null -/// column or cast something then we need to transform the data. Therefore we also call -/// [`needs_transform`] to ensure both the ascending nature of the indices AND that no transform is -/// required. -fn is_ordered(requested_ordering: &[ReorderIndex]) -> bool { +/// Check if an ordering requires transforming the data in any way. This is true if the indices are +/// NOT in ascending order (so we have to reorder things), or if we need to do any transformation on +/// the data read from parquet. We check the ordering here, and also call +/// `ReorderIndex::needs_transform` on each element to check for other transforms, and to check +/// `Nested` variants recursively. +fn ordering_needs_transform(requested_ordering: &[ReorderIndex]) -> bool { if requested_ordering.is_empty() { - return true; + return false; } // we have >=1 element. check that the first element doesn't need a transform if requested_ordering[0].needs_transform() { - return false; + return true; } - // now check that all elements are ordered wrt. 
each other, and internally don't need - // transformation + // Check for all elements if we need a transform. This is true if any elements are not in order + // (i.e. element[i].index < element[i+1].index), or any element needs a transform requested_ordering .windows(2) - .all(|ri| (ri[0].index < ri[1].index) && !ri[1].needs_transform()) + .any(|ri| (ri[0].index >= ri[1].index) || ri[1].needs_transform()) } // we use this as a placeholder for an array and its associated field. We can fill in a Vec of None @@ -570,7 +570,7 @@ pub(crate) fn reorder_struct_array( input_data: StructArray, requested_ordering: &[ReorderIndex], ) -> DeltaResult { - if is_ordered(requested_ordering) { + if !ordering_needs_transform(requested_ordering) { // indices is already sorted, meaning we requested in the order that the columns were // stored in the parquet Ok(input_data) From 38e9e9f4bbf54f0cf7d39cd45929917c75dfca7c Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 14:39:28 -0700 Subject: [PATCH 50/54] address a few comments --- kernel/src/engine/arrow_utils.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 001790cef..347b0d294 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -586,21 +586,22 @@ pub(crate) fn reorder_struct_array( // requested_ordering tells us where it is in the parquet data match &reorder_index.transform { ReorderIndexTransform::Cast(target) => { - let source_col = input_cols[parquet_position].as_ref(); - let new_col = Arc::new(arrow_cast::cast::cast(source_col, target)?); + let col = input_cols[parquet_position].as_ref(); + let col = Arc::new(arrow_cast::cast::cast(col, target)?); let new_field = Arc::new( input_fields[parquet_position] .as_ref() .clone() - .with_data_type(new_col.data_type().clone()), + .with_data_type(col.data_type().clone()), ); - final_fields_cols[reorder_index.index] = 
Some((new_field, new_col)); + final_fields_cols[reorder_index.index] = Some((new_field, col)); } ReorderIndexTransform::Nested(children) => { match input_cols[parquet_position].data_type() { ArrowDataType::Struct(_) => { let struct_array = input_cols[parquet_position].as_struct().clone(); - let result_array = reorder_struct_array(struct_array, children)?; + let result_array = + Arc::new(reorder_struct_array(struct_array, children)?); // create the new field specifying the correct order for the struct let new_field = Arc::new(ArrowField::new_struct( input_fields[parquet_position].name(), @@ -608,7 +609,7 @@ pub(crate) fn reorder_struct_array( input_fields[parquet_position].is_nullable(), )); final_fields_cols[reorder_index.index] = - Some((new_field, Arc::new(result_array))); + Some((new_field, result_array)); } ArrowDataType::List(_) => { let list_array = input_cols[parquet_position].as_list::().clone(); From cf47ea9aa6024d9d981a895142a2cdbdbb7c388c Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 16:25:50 -0700 Subject: [PATCH 51/54] clean up iterators --- kernel/src/engine/arrow_utils.rs | 122 ++++++++++++++++++------------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 347b0d294..26627ca96 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -133,13 +133,7 @@ pub(crate) fn ensure_data_types( let missing_field_names = kernel_fields .fields .keys() - .filter_map(|kernel_field| { - if arrow_field_map.contains(kernel_field) { - None - } else { - Some(kernel_field) - } - }) + .filter(|kernel_field| !arrow_field_map.contains(kernel_field)) .take(5) .join(", "); make_arrow_error(format!( @@ -255,11 +249,11 @@ pub(crate) fn ensure_data_types( * ] */ -/// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. 
Each element's index -/// represents a column that will be in the read parquet data at that level and index. The `index` -/// of the element is the position that the column should appear in the final output. The `transform` -/// indicates what, if any, transforms are needed. See the docs for [`ReorderIndexTransform`] for the -/// meaning. +/// Reordering is specified as a tree. Each level is a vec of `ReorderIndex`s. Each element's +/// position represents a column that will be in the read parquet data at that level and +/// position. The `index` of the element is the position that the column should appear in the final +/// output. The `transform` indicates what, if any, transforms are needed. See the docs for +/// [`ReorderIndexTransform`] for the meaning. #[derive(Debug, PartialEq)] pub(crate) struct ReorderIndex { pub(crate) index: usize, @@ -344,16 +338,21 @@ fn get_indices( let mut found_fields = HashSet::with_capacity(requested_schema.fields.len()); let mut reorder_indices = Vec::with_capacity(requested_schema.fields.len()); let mut parquet_offset = start_parquet_offset; - for (parquet_index, field) in fields.iter().enumerate() { + // for each field, get its position in the parquet (via enumerate), a reference to the arrow + // field, and info about where it appears in the requested_schema, or None if the field is not + // requested + let all_field_info = fields.iter().enumerate().map(|(parquet_index, field)| { + let field_info = requested_schema.fields.get_full(field.name()); + (parquet_index, field, field_info) + }); + for (parquet_index, field, field_info) in all_field_info { debug!( "Getting indices for field {} with offset {parquet_offset}, with index {parquet_index}", field.name() ); - match field.data_type() { - ArrowDataType::Struct(fields) => { - if let Some((index, _, requested_field)) = - requested_schema.fields.get_full(field.name()) - { + if let Some((index, _, requested_field)) = field_info { + match field.data_type() { + 
ArrowDataType::Struct(fields) => { if let DataType::Struct(ref requested_schema) = requested_field.data_type { let (parquet_advance, children) = get_indices( parquet_index + parquet_offset, @@ -372,20 +371,10 @@ fn get_indices( } else { return Err(Error::unexpected_column_type(field.name())); } - } else { - // We're NOT selecting this field, but we still need to update how much we skip - debug!("Skipping over un-selected struct: {}", field.name()); - // offset by number of inner fields. subtract one, because the enumerate still - // counts this field - parquet_offset += count_cols(field) - 1; } - } - ArrowDataType::List(list_field) - | ArrowDataType::LargeList(list_field) - | ArrowDataType::ListView(list_field) => { - if let Some((index, _, requested_field)) = - requested_schema.fields.get_full(field.name()) - { + ArrowDataType::List(list_field) + | ArrowDataType::LargeList(list_field) + | ArrowDataType::ListView(list_field) => { // we just want to transparently recurse into lists, need to transform the kernel // list data type into a schema if let DataType::Array(array_type) = requested_field.data_type() { @@ -418,11 +407,7 @@ fn get_indices( return Err(Error::unexpected_column_type(list_field.name())); } } - } - ArrowDataType::Map(key_val_field, _) => { - if let Some((index, _, requested_field)) = - requested_schema.fields.get_full(field.name()) - { + ArrowDataType::Map(key_val_field, _) => { match (key_val_field.data_type(), requested_field.data_type()) { (ArrowDataType::Struct(inner_fields), DataType::Map(map_type)) => { let mut key_val_names = @@ -457,27 +442,31 @@ fn get_indices( } } } - } - _ => { - if let Some((index, _, requested_field)) = - requested_schema.fields.get_full(field.name()) - { + _ => { match ensure_data_types(&requested_field.data_type, field.data_type())? 
{ - DataTypeCompat::Identical => - reorder_indices.push(ReorderIndex::identity(index)), - DataTypeCompat::NeedsCast(target) => - reorder_indices.push(ReorderIndex::cast(index, target)), - DataTypeCompat::Nested => return - Err(Error::generic( - "Comparing nested types in get_indices. This is a kernel bug, please report" - )) - } + DataTypeCompat::Identical => + reorder_indices.push(ReorderIndex::identity(index)), + DataTypeCompat::NeedsCast(target) => + reorder_indices.push(ReorderIndex::cast(index, target)), + DataTypeCompat::Nested => return + Err(Error::generic( + "Comparing nested types in get_indices. This is a kernel bug, please report" + )) + } found_fields.insert(requested_field.name()); mask_indices.push(parquet_offset + parquet_index); } } + } else { + // We're NOT selecting this field, but we still need to track how many leaf columns we + // skipped over + debug!("Skipping over un-selected field: {}", field.name()); + // offset by number of inner fields. subtract one, because the enumerate still + // counts this logical "parent" field + parquet_offset += count_cols(field) - 1; } } + if found_fields.len() != requested_schema.fields.len() { // some fields are missing, but they might be nullable, need to insert them into the reorder_indices for (requested_position, field) in requested_schema.fields().enumerate() { @@ -1027,6 +1016,39 @@ mod tests { assert_eq!(reorder_indices, expect_reorder); } + #[test] + fn nested_indices_unselected_list() { + let requested_schema = Arc::new(StructType::new(vec![ + StructField::new("i", DataType::INTEGER, false), + StructField::new("j", DataType::INTEGER, false), + ])); + let parquet_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "nested", + ArrowDataType::Struct( + vec![ + ArrowField::new("int32", ArrowDataType::Int32, false), + ArrowField::new("string", ArrowDataType::Utf8, false), + ] + .into(), 
+ ), + false, + ))), + false, + ), + ArrowField::new("j", ArrowDataType::Int32, false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); + let expect_mask = vec![0, 3]; + let expect_reorder = vec![ReorderIndex::identity(0), ReorderIndex::identity(1)]; + assert_eq!(mask_indices, expect_mask); + assert_eq!(reorder_indices, expect_reorder); + } + #[test] fn nested_indices_list_mask_inner() { let requested_schema = Arc::new(StructType::new(vec![ From c477e1e150dd0c165efa344ba4e6e1f0960ef57b Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 16:37:22 -0700 Subject: [PATCH 52/54] add and use InternalError type --- kernel/src/engine/arrow_utils.rs | 46 +++++++++++++++++--------------- kernel/src/error.rs | 8 ++++++ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 26627ca96..e7c1bec8e 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -444,15 +444,18 @@ fn get_indices( } _ => { match ensure_data_types(&requested_field.data_type, field.data_type())? { - DataTypeCompat::Identical => - reorder_indices.push(ReorderIndex::identity(index)), - DataTypeCompat::NeedsCast(target) => - reorder_indices.push(ReorderIndex::cast(index, target)), - DataTypeCompat::Nested => return - Err(Error::generic( - "Comparing nested types in get_indices. 
This is a kernel bug, please report" - )) - } + DataTypeCompat::Identical => { + reorder_indices.push(ReorderIndex::identity(index)) + } + DataTypeCompat::NeedsCast(target) => { + reorder_indices.push(ReorderIndex::cast(index, target)) + } + DataTypeCompat::Nested => { + return Err(Error::internal_error( + "Comparing nested types in get_indices", + )) + } + } found_fields.insert(requested_field.name()); mask_indices.push(parquet_offset + parquet_index); } @@ -618,9 +621,9 @@ pub(crate) fn reorder_struct_array( } // TODO: MAP _ => { - return Err(Error::generic( - "Nested reorder can only apply to struct/list/map. This is a kernel bug, please report" - )); + return Err(Error::internal_error( + "Nested reorder can only apply to struct/list/map.", + )); } } } @@ -641,15 +644,14 @@ pub(crate) fn reorder_struct_array( let (field_vec, reordered_columns): (Vec>, _) = final_fields_cols.into_iter().flatten().unzip(); if field_vec.len() != num_cols { - return Err(Error::generic( - "Found a None in final_fields_cols. This is a kernel bug, please report.", - )); + Err(Error::internal_error("Found a None in final_fields_cols.")) + } else { + Ok(StructArray::try_new( + field_vec.into(), + reordered_columns, + null_buffer, + )?) } - Ok(StructArray::try_new( - field_vec.into(), - reordered_columns, - null_buffer, - )?) } } @@ -680,8 +682,8 @@ fn reorder_list( )?); Ok(Some((new_field, list))) } else { - Err(Error::generic( - "Nested reorder of list should have had struct child. This is a kernel bug, please report" + Err(Error::internal_error( + "Nested reorder of list should have had struct child.", )) } } diff --git a/kernel/src/error.rs b/kernel/src/error.rs index ccaa1dc98..e79fa6260 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -52,6 +52,10 @@ pub enum Error { #[error(transparent)] IOError(std::io::Error), + /// An internal error that means kernel found an unexpected situation, which is likely a bug + #[error("Internal error {0}. 
This is a kernel bug, please report.")] + InternalError(String), + /// An error enountered while working with parquet data #[cfg(feature = "parquet")] #[error("Arrow error: {0}")] @@ -193,6 +197,10 @@ impl Error { Self::InvalidStructData(msg.to_string()) } + pub fn internal_error(msg: impl ToString) -> Self { + Self::InternalError(msg.to_string()).with_backtrace() + } + // Capture a backtrace when the error is constructed. #[must_use] pub fn with_backtrace(self) -> Self { From 74530f13fa79c6dfd99a2cdae1a5ff71f1eaec57 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 16:40:43 -0700 Subject: [PATCH 53/54] use swap_remove --- kernel/src/engine/arrow_utils.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index e7c1bec8e..522454d19 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -383,7 +383,7 @@ fn get_indices( array_type.element_type.clone(), array_type.contains_null, )]); - let (parquet_advance, children) = get_indices( + let (parquet_advance, mut children) = get_indices( found_fields.len() + parquet_offset, &requested_schema, &[list_field.clone()].into(), @@ -398,7 +398,7 @@ fn get_indices( )); } // safety, checked that we have 1 element - let mut children = children.into_iter().next().unwrap(); + let mut children = children.swap_remove(0); // the index is wrong, as it's the index from the inner schema. 
Adjust // it to be our index children.index = index; From 2f42578f5ef3aac5a4895c93ef0cfa5d117c197d Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Mon, 15 Jul 2024 16:45:03 -0700 Subject: [PATCH 54/54] minor ffi fixes --- ffi/src/lib.rs | 2 ++ ffi/src/scan.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index cf3685896..0942056f3 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -263,6 +263,7 @@ pub enum KernelError { InvalidTableLocationError, InvalidDecimalError, InvalidStructDataError, + InternalError, } impl From for KernelError { @@ -303,6 +304,7 @@ impl From for KernelError { Error::InvalidTableLocation(_) => KernelError::InvalidTableLocationError, Error::InvalidDecimal(_) => KernelError::InvalidDecimalError, Error::InvalidStructData(_) => KernelError::InvalidStructDataError, + Error::InternalError(_) => KernelError::InternalError, Error::Backtraced { source, backtrace: _, diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index e4e9c5867..cbee3ee3f 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -5,7 +5,7 @@ use std::ffi::c_void; use std::sync::{Arc, Mutex}; use delta_kernel::scan::state::{visit_scan_files, DvInfo, GlobalScanState}; -use delta_kernel::scan::{Scan, ScanBuilder, ScanData}; +use delta_kernel::scan::{Scan, ScanData}; use delta_kernel::schema::Schema; use delta_kernel::snapshot::Snapshot; use delta_kernel::{DeltaResult, EngineData, Error};