Push parquet select to leaves, add correct reordering #271

Merged · 54 commits · Jul 16, 2024

Commits (54):
- 62772f1: test for json (nicklan, Jun 27, 2024)
- 3e9665c: checkpoint before ReorderIndex (nicklan, Jun 28, 2024)
- 16c7846: working with ReorderIndex enum (nicklan, Jun 28, 2024)
- 7625af1: factor out common arrow schema usage (nicklan, Jun 28, 2024)
- 014a5a8: child types need an index too (nicklan, Jun 28, 2024)
- 3328457: reordering is back, but some things broken, no nested reorder yet (nicklan, Jun 28, 2024)
- e48d3ae: checkpoint (nicklan, Jun 28, 2024)
- f562715: actually fix column ordering (nicklan, Jun 28, 2024)
- 4134e9a: checkpoint, properly skipping inner structs (nicklan, Jul 1, 2024)
- b5f5983: working! other than re-ordering children (nicklan, Jul 1, 2024)
- 1d72be8: comment fixup (nicklan, Jul 1, 2024)
- 9cac2c5: actually reorder children (nicklan, Jul 2, 2024)
- 83836cb: no need to swap back (nicklan, Jul 2, 2024)
- 2c48f0c: cleaner Vec initialization (nicklan, Jul 2, 2024)
- b695cef: DON'T PANIC (nicklan, Jul 8, 2024)
- baf80e7: just use as_struct (nicklan, Jul 8, 2024)
- 49e2581: handle list of struct sorting (nicklan, Jul 9, 2024)
- 7aa9870: handle deeper list nesting (nicklan, Jul 9, 2024)
- f76c44e: Fix comment (nicklan, Jul 9, 2024)
- 4cd8aaf: use Into::into (nicklan, Jul 10, 2024)
- 01d2c07: make rii! a const fn (nicklan, Jul 10, 2024)
- 3c55f88: arc earlier (nicklan, Jul 10, 2024)
- 07a750c: move ensure_data_types, and use in computation of indices (nicklan, Jul 10, 2024)
- 2937f77: use ok_or_else for map key/val (nicklan, Jul 10, 2024)
- 5bcc2ee: parquet files actually contain integers, not longs (nicklan, Jul 10, 2024)
- ab84388: add simple map test (nicklan, Jul 10, 2024)
- 189767f: if-let better (nicklan, Jul 10, 2024)
- 94b1dc9: no need for ref (nicklan, Jul 10, 2024)
- 41d177e: Null -> Missing (nicklan, Jul 10, 2024)
- 0210bb7: refactor ReorderIndex into a struct (nicklan, Jul 10, 2024)
- c040827: some cleanup (nicklan, Jul 10, 2024)
- b54a791: add timestamp special case + no_matches test (nicklan, Jul 10, 2024)
- dfd662f: Add empty_requested_schema test (nicklan, Jul 10, 2024)
- 1b372a5: flatten and unzip (nicklan, Jul 11, 2024)
- c355100: add a giant doc comment (nicklan, Jul 11, 2024)
- 6b92d74: undo the horrible hack, bump dat version (nicklan, Jul 12, 2024)
- 56fd9d4: handle timestamp conversions (nicklan, Jul 12, 2024)
- cda2834: add basic_decimal test (nicklan, Jul 12, 2024)
- dd127a4: add timestamp_ntz test, fix test bug (nicklan, Jul 12, 2024)
- d772be5: fmt + test fix (nicklan, Jul 12, 2024)
- b5157d7: add new test data (nicklan, Jul 12, 2024)
- e299ed3: comment fixups (nicklan, Jul 12, 2024)
- f5ca39e: rename new_index -> new_none (nicklan, Jul 12, 2024)
- b580857: fix transform checks since we now inverted (nicklan, Jul 12, 2024)
- 8ad1933: Apply suggestions from code review (nicklan, Jul 15, 2024)
- fa61467: addressing lots of comments (nicklan, Jul 15, 2024)
- f6ff0f3: None -> Identity (nicklan, Jul 15, 2024)
- 68bec02: cleaner new methods for ReorderIndex (nicklan, Jul 15, 2024)
- 62fb031: renaming to ordering_needs_transform (nicklan, Jul 15, 2024)
- 38e9e9f: address a few comments (nicklan, Jul 15, 2024)
- cf47ea9: clean up iterators (nicklan, Jul 15, 2024)
- c477e1e: add and use InternalError type (nicklan, Jul 15, 2024)
- 74530f1: use swap_remove (nicklan, Jul 15, 2024)
- 2f42578: minor ffi fixes (nicklan, Jul 15, 2024)
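The commit trail centers on a reorder-index structure: for each leaf column read from parquet, it records where that column belongs in the requested schema and whether it needs a transform. The sketch below is pieced together from the commit messages alone ("refactor ReorderIndex into a struct", "None -> Identity", "Null -> Missing", "renaming to ordering_needs_transform"); the names and shape are illustrative assumptions, not the crate's actual definition.

```rust
/// Hypothetical reorder index, reconstructed from the commit messages above;
/// the real type in delta-kernel-rs may differ in shape and naming.
pub struct ReorderIndex {
    /// The position this column should occupy in the reordered output batch.
    pub index: usize,
    pub transform: Transform,
}

pub enum Transform {
    /// Column already has the right type; it may still need to move.
    Identity,
    /// Column needs a cast first (e.g. nanosecond -> microsecond timestamps).
    Cast,
    /// Column was requested but absent from the parquet file; fill with nulls.
    Missing,
}

/// True if any column must move, be cast, or be back-filled, mirroring the
/// "renaming to ordering_needs_transform" commit.
pub fn ordering_needs_transform(order: &[ReorderIndex]) -> bool {
    order
        .iter()
        .enumerate()
        .any(|(pos, ri)| ri.index != pos || !matches!(ri.transform, Transform::Identity))
}
```

The payoff of a check like this is the fast path: when every index is already in place and every transform is an identity, the batch can be returned untouched.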
Cargo.toml (1 change: 1 addition & 0 deletions)

@@ -24,6 +24,7 @@ version = "0.1.1"
 arrow = { version = "^52.0" }
 arrow-arith = { version = "^52.0" }
 arrow-array = { version = "^52.0" }
+arrow-cast = { version = "^52.0" }
 arrow-data = { version = "^52.0" }
 arrow-ord = { version = "^52.0" }
 arrow-json = { version = "^52.0" }
acceptance/src/data.rs (11 changes: 6 additions & 5 deletions)

@@ -60,11 +60,12 @@ pub fn sort_record_batch(batch: RecordBatch) -> DeltaResult<RecordBatch> {
     Ok(RecordBatch::try_new(batch.schema(), columns)?)
 }
 
-static SKIPPED_TESTS: &[&str; 1] = &[
-    // For multi_partitioned_2: The golden table stores the timestamp as an INT96 (which is
-    // nanosecond precision), while the spec says we should read partition columns as
-    // microseconds. This means the read and golden data don't line up. When this is released in
-    // `dat` upstream, we can stop skipping this test
+static SKIPPED_TESTS: &[&str; 2] = &[
+    // For all_primitive_types and multi_partitioned_2: The golden table stores the timestamp as an
+    // INT96 (which is nanosecond precision), while the spec says we should read partition columns
+    // as microseconds. This means the read and golden data don't line up. When this is released in
+    // `dat` upstream, we can stop skipping these tests
+    "all_primitive_types",
     "multi_partitioned_2",
 ];

Review comment on "all_primitive_types" (nicklan, collaborator and author):

Now that we handle timestamps correctly, the golden data is "wrong" for this table. This will be fixed by updating to dat 0.0.3, and I'll do that shortly after this merges. I've verified manually that we correctly read these two tables.
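To make the skip reason concrete, here is a minimal standalone sketch (not kernel code; the array values are invented for illustration): an INT96 value decodes to a nanosecond-precision timestamp, while the spec wants partition values read as microseconds, so the golden data must be cast down before it can compare equal to what the kernel reads.

```rust
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::types::TimestampMicrosecondType;
use arrow_array::{ArrayRef, TimestampNanosecondArray};
use arrow_cast::cast;
use arrow_schema::{DataType, TimeUnit};

fn main() {
    // INT96 values in old parquet files decode to nanosecond precision.
    let golden: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![
        1_720_000_000_123_456_789_i64,
    ]));

    // The Delta spec says partition timestamps are microsecond precision, so
    // the nanosecond golden data must be cast down before comparison.
    let read_precision = DataType::Timestamp(TimeUnit::Microsecond, None);
    let as_micros = cast(&golden, &read_precision).expect("timestamp cast");

    // The cast drops the sub-microsecond digits.
    let micros = as_micros.as_primitive::<TimestampMicrosecondType>();
    assert_eq!(micros.value(0), 1_720_000_000_123_456_i64);
}
```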
ffi/src/lib.rs (2 changes: 2 additions & 0 deletions)

@@ -263,6 +263,7 @@ pub enum KernelError {
     InvalidTableLocationError,
     InvalidDecimalError,
     InvalidStructDataError,
+    InternalError,
 }
 
 impl From<Error> for KernelError {
@@ -303,6 +304,7 @@
             Error::InvalidTableLocation(_) => KernelError::InvalidTableLocationError,
             Error::InvalidDecimal(_) => KernelError::InvalidDecimalError,
             Error::InvalidStructData(_) => KernelError::InvalidStructDataError,
+            Error::InternalError(_) => KernelError::InternalError,
             Error::Backtraced {
                 source,
                 backtrace: _,
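One way to see what the new variant buys FFI consumers: an internal kernel error now surfaces as its own stable error code rather than falling through to a catch-all. A sketch of a unit test that could sit next to this impl, assuming (as the match arm above suggests) that Error::InternalError wraps a String:

```rust
// Hypothetical test inside ffi/src/lib.rs; the Error::InternalError(String)
// construction is an assumption inferred from the match arm in the diff.
#[test]
fn internal_error_maps_to_internal_error_code() {
    use delta_kernel::Error;

    let err = Error::InternalError("unexpected reorder state".to_string());
    assert!(matches!(KernelError::from(err), KernelError::InternalError));
}
```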
ffi/src/scan.rs (2 changes: 1 addition & 1 deletion)

@@ -5,7 +5,7 @@ use std::ffi::c_void;
 use std::sync::{Arc, Mutex};
 
 use delta_kernel::scan::state::{visit_scan_files, DvInfo, GlobalScanState};
-use delta_kernel::scan::{Scan, ScanBuilder, ScanData};
+use delta_kernel::scan::{Scan, ScanData};
 use delta_kernel::schema::Schema;
 use delta_kernel::snapshot::Snapshot;
 use delta_kernel::{DeltaResult, EngineData, Error};
kernel/Cargo.toml (5 changes: 4 additions & 1 deletion)

@@ -37,9 +37,10 @@ delta_kernel_derive = { path = "../derive-macros", version = "0.1.1" }
 visibility = "0.1.0"
 
 # Used in default engine
-arrow-array = { workspace = true, optional = true }
+arrow-array = { workspace = true, optional = true, features = ["chrono-tz"] }
 arrow-select = { workspace = true, optional = true }
 arrow-arith = { workspace = true, optional = true }
+arrow-cast = { workspace = true, optional = true }
 arrow-json = { workspace = true, optional = true }
 arrow-ord = { workspace = true, optional = true }
 arrow-schema = { workspace = true, optional = true }
@@ -67,6 +68,7 @@ default-engine = [
     "arrow-conversion",
     "arrow-expression",
     "arrow-array",
+    "arrow-cast",
     "arrow-json",
     "arrow-schema",
     "arrow-select",
@@ -80,6 +82,7 @@
 
 developer-visibility = []
 sync-engine = [
+    "arrow-cast",
     "arrow-conversion",
     "arrow-expression",
     "arrow-array",
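Downstream, nothing changes beyond the feature graph: enabling either engine feature now also builds arrow-cast, and arrow-array gains chrono-tz for named-timezone handling. A minimal consumer manifest might look like the sketch below (crate name and version are taken from this workspace; treat the snippet as illustrative):

```toml
[dependencies]
# default-engine transitively enables arrow-cast (used for the timestamp and
# decimal casts added in this PR) and arrow-array's chrono-tz feature.
delta_kernel = { version = "0.1.1", features = ["default-engine"] }
```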
kernel/src/engine/arrow_expression.rs (89 changes: 1 addition & 88 deletions)

@@ -17,10 +17,10 @@ use itertools::Itertools;
 
 use super::arrow_conversion::LIST_ARRAY_ROOT;
 use crate::engine::arrow_data::ArrowEngineData;
+use crate::engine::arrow_utils::ensure_data_types;
 use crate::error::{DeltaResult, Error};
 use crate::expressions::{BinaryOperator, Expression, Scalar, UnaryOperator, VariadicOperator};
 use crate::schema::{DataType, PrimitiveType, SchemaRef};
-use crate::utils::require;
 use crate::{EngineData, ExpressionEvaluator, ExpressionHandler};
 
 // TODO leverage scalars / Datum
@@ -161,93 +161,6 @@ fn column_as_struct<'a>(
         .ok_or(ArrowError::SchemaError(format!("{} is not a struct", name)))
 }

-fn make_arrow_error(s: String) -> Error {
-    Error::Arrow(arrow_schema::ArrowError::InvalidArgumentError(s))
-}
-
-/// Ensure a kernel data type matches an arrow data type. This only ensures that the actual "type"
-/// is the same, but does so recursively into structs, and ensures lists and maps have the correct
-/// associated types as well. This returns an `Ok(())` if the types are compatible, or an error if
-/// the types do not match. If there is a `struct` type included, we only ensure that the named
-/// fields that the kernel is asking for exist, and that for those fields the types
-/// match. Un-selected fields are ignored.
-fn ensure_data_types(kernel_type: &DataType, arrow_type: &ArrowDataType) -> DeltaResult<()> {
-    match (kernel_type, arrow_type) {
-        (DataType::Primitive(_), _) if arrow_type.is_primitive() => Ok(()),
-        (DataType::Primitive(PrimitiveType::Boolean), ArrowDataType::Boolean)
-        | (DataType::Primitive(PrimitiveType::String), ArrowDataType::Utf8)
-        | (DataType::Primitive(PrimitiveType::Binary), ArrowDataType::Binary) => {
-            // strings, bools, and binary aren't primitive in arrow
-            Ok(())
-        }
-        (
-            DataType::Primitive(PrimitiveType::Decimal(kernel_prec, kernel_scale)),
-            ArrowDataType::Decimal128(arrow_prec, arrow_scale),
-        ) if arrow_prec == kernel_prec && *arrow_scale == *kernel_scale as i8 => {
-            // decimal isn't primitive in arrow. cast above is okay as we limit range
-            Ok(())
-        }
-        (DataType::Array(inner_type), ArrowDataType::List(arrow_list_type)) => {
-            let kernel_array_type = &inner_type.element_type;
-            let arrow_list_type = arrow_list_type.data_type();
-            ensure_data_types(kernel_array_type, arrow_list_type)
-        }
-        (DataType::Map(kernel_map_type), ArrowDataType::Map(arrow_map_type, _)) => {
-            if let ArrowDataType::Struct(fields) = arrow_map_type.data_type() {
-                let mut fiter = fields.iter();
-                if let Some(key_type) = fiter.next() {
-                    ensure_data_types(&kernel_map_type.key_type, key_type.data_type())?;
-                } else {
-                    return Err(make_arrow_error(
-                        "Arrow map struct didn't have a key type".to_string(),
-                    ));
-                }
-                if let Some(value_type) = fiter.next() {
-                    ensure_data_types(&kernel_map_type.value_type, value_type.data_type())?;
-                } else {
-                    return Err(make_arrow_error(
-                        "Arrow map struct didn't have a value type".to_string(),
-                    ));
-                }
-                Ok(())
-            } else {
-                Err(make_arrow_error(
-                    "Arrow map type wasn't a struct.".to_string(),
-                ))
-            }
-        }
-        (DataType::Struct(kernel_fields), ArrowDataType::Struct(arrow_fields)) => {
-            // build a list of kernel fields that matches the order of the arrow fields
-            let mapped_fields = arrow_fields
-                .iter()
-                .flat_map(|f| kernel_fields.fields.get(f.name()));
-
-            // keep track of how many fields we matched up
-            let mut found_fields = 0;
-            // ensure that for the fields that we found, the types match
-            for (kernel_field, arrow_field) in mapped_fields.zip(arrow_fields) {
-                ensure_data_types(&kernel_field.data_type, arrow_field.data_type())?;
-                found_fields += 1;
-            }
-
-            // require that we found the number of fields that we requested.
-            require!(kernel_fields.fields.len() == found_fields, {
-                let kernel_field_names = kernel_fields.fields.keys().join(", ");
-                let arrow_field_names = arrow_fields.iter().map(|f| f.name()).join(", ");
-                make_arrow_error(format!(
-                    "Missing Struct fields. Requested: {}, found: {}",
-                    kernel_field_names, arrow_field_names,
-                ))
-            });
-            Ok(())
-        }
-        _ => Err(make_arrow_error(format!(
-            "Incorrect datatype. Expected {}, got {}",
-            kernel_type, arrow_type
-        ))),
-    }
-}

 fn evaluate_expression(
     expression: &Expression,
     batch: &RecordBatch,
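Since the deleted block above only moves (the import of crate::engine::arrow_utils::ensure_data_types added earlier in this file points at its new home), its contract is unchanged. Below is a sketch of an in-crate unit test, assuming the relocated function keeps the signature shown in the deleted block and is visible within the crate:

```rust
#[cfg(test)]
mod tests {
    use arrow_schema::DataType as ArrowDataType;

    // Assumed new home per the import added in this PR.
    use crate::engine::arrow_utils::ensure_data_types;
    use crate::schema::{DataType, PrimitiveType};

    #[test]
    fn primitives_match_loosely_but_mismatches_error() {
        let kernel_long = DataType::Primitive(PrimitiveType::Long);
        // In this version, any arrow primitive satisfies a kernel primitive
        // (the first match arm above).
        assert!(ensure_data_types(&kernel_long, &ArrowDataType::Int64).is_ok());
        // Utf8 is not primitive in arrow, so a kernel long against it falls
        // through to the "Incorrect datatype" arm and errors.
        assert!(ensure_data_types(&kernel_long, &ArrowDataType::Utf8).is_err());
    }
}
```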