From b7e67023f28ed995ad1c709883f82d2e49e1f22a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 13 Dec 2023 09:07:25 -0800 Subject: [PATCH 01/27] partially implement FFI --- ffi/Cargo.toml | 5 +- ffi/Makefile | 27 +++ ffi/cbindgen.toml | 10 ++ ffi/cffi-test.c | 62 +++++++ ffi/kernel.h | 106 ++++++++++++ ffi/kernel.hpp | 97 +++++++++++ ffi/src/lib.rs | 414 +++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 717 insertions(+), 4 deletions(-) create mode 100644 ffi/Makefile create mode 100644 ffi/cbindgen.toml create mode 100644 ffi/cffi-test.c create mode 100644 ffi/kernel.h create mode 100644 ffi/kernel.hpp diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index 8cafb4c30..c6eb62861 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -12,8 +12,11 @@ build = "build.rs" [lib] crate-type = ["cdylib", "staticlib"] +[dependencies] +url = "2" +deltakernel = { path = "../kernel", features = ["developer-visibility", "tokio"] } + [build-dependencies] cbindgen = "0.26.0" -deltakernel = { path = "../kernel" } libc = "0.2.147" diff --git a/ffi/Makefile b/ffi/Makefile new file mode 100644 index 000000000..c04848f20 --- /dev/null +++ b/ffi/Makefile @@ -0,0 +1,27 @@ +BINARY=cffi-test +SOURCES=cffi-test.c +INCPATHS=kernel.h +LIBPATHS=../target/debug +LDFLAGS=-ldeltakernel #`pkg-config --libs arrow-glib` +CFLAGS=-c -Wall #`pkg-config --cflags arrow-glib` +CC=gcc + +OBJECTS=$(SOURCES:.c=.o) +#INCFLAGS=$(foreach TMP,$(INCPATHS),-I$(TMP)) +LIBFLAGS=$(foreach TMP,$(LIBPATHS),-L$(TMP)) + +all: $(SOURCES) $(BINARY) + +$(BINARY): $(OBJECTS) + $(CC) $(LIBFLAGS) $(OBJECTS) $(LDFLAGS) -o $@ +.c.o: + $(CC) $(INCFLAGS) $(CFLAGS) -fPIC $< -o $@ + +run: + LD_LIBRARY_PATH=$(LIBPATHS) ./$(BINARY) $(table) + +distclean: clean + rm -f $(BINARY) + +clean: + rm -f $(OBJECTS) diff --git a/ffi/cbindgen.toml b/ffi/cbindgen.toml new file mode 100644 index 000000000..634a95a1f --- /dev/null +++ b/ffi/cbindgen.toml @@ -0,0 +1,10 @@ +after_includes = """ + +// ====================================================================================== +// Missing forward declarations from deltakernel crate, added manually via cbindgen.toml +// ====================================================================================== +struct Snapshot; +struct TokioBackgroundExecutor; +template struct DefaultTableClient; +// ====================================================================================== +""" diff --git a/ffi/cffi-test.c b/ffi/cffi-test.c new file mode 100644 index 000000000..c9460f5f9 --- /dev/null +++ b/ffi/cffi-test.c @@ -0,0 +1,62 @@ +#include + +#include "kernel.h" + +/* #include */ +/* #include */ +/* #include "arrow/c/abi.h" */ + +typedef struct iter_data { + int lim; + int cur; +} iter_data; + +const void* next(void* data) { + iter_data *id = (iter_data*)data; + if (id->cur >= id->lim) { + return 0; + } else { + id->cur++; + return &id->cur; + } +} + +void release(void* data) { + printf("released\n"); +} + +void test_iter() { + iter_data it; + it.lim = 10; + it.cur = 0; + + struct EngineIterator eit = { + .data = &it, + .get_next = &next, + }; + iterate(&eit); +} + +int main(int argc, char* argv[]) { + + if (argc < 2) { + printf("Usage: %s [table_path]\n", argv[0]); + return -1; + } + + char* table_path = argv[1]; + printf("Opening table at %s\n", table_path); + DefaultTable *table = get_table_with_default_client(table_path); + DefaultSnapshot *ss = snapshot(table); + uint64_t v = version(ss); + printf("Got version: %lu\n", v); + + struct FileList fl = get_scan_files(ss, NULL); + printf("Need to read %i files\n", fl.file_count); + for (int i = 0;i < fl.file_count;i++) { + printf("file %i -> %s\n", i, fl.files[i]); + } + + test_iter(); + return 0; +} diff --git a/ffi/kernel.h b/ffi/kernel.h new file mode 100644 index 000000000..d6461fac3 --- /dev/null +++ b/ffi/kernel.h @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +// ====================================================================================== +// Missing forward declarations from deltakernel crate, added manually via cbindgen.toml +// ====================================================================================== +struct Snapshot; +struct TokioBackgroundExecutor; +template struct DefaultTableClient; +// ====================================================================================== + + +typedef struct KernelExpressionVisitorState KernelExpressionVisitorState; + +/** + * Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap + * the engine functions. The engine retains ownership of the iterator. + */ +typedef struct EngineIterator { + void *data; + /** + * A function that should advance the iterator and return the next time from the data + * If the iterator is complete, it should return null. It should be safe to + * call `get_next()` multiple times if it is null. + */ + const void *(*get_next)(void *data); +} EngineIterator; + +typedef DefaultTableClient KernelDefaultTableClient; + +typedef struct EngineSchemaVisitor { + void *data; + uintptr_t (*make_field_list)(void *data, uintptr_t reserve); + void (*visit_struct)(void *data, + uintptr_t sibling_list_id, + const char *name, + uintptr_t child_list_id); + void (*visit_string)(void *data, uintptr_t sibling_list_id, const char *name); + void (*visit_integer)(void *data, uintptr_t sibling_list_id, const char *name); + void (*visit_long)(void *data, uintptr_t sibling_list_id, const char *name); +} EngineSchemaVisitor; + +typedef struct FileList { + char **files; + int32_t file_count; +} FileList; + +typedef struct EnginePredicate { + void *predicate; + uintptr_t (*visitor)(void *predicate, struct KernelExpressionVisitorState *state); +} EnginePredicate; + +/** + * test function to print for items. this assumes each item is an `int` + */ +void iterate(struct EngineIterator *it); + +const KernelDefaultTableClient *get_default_client(const char *path); + +/** + * Get the latest snapshot from the specified table + */ +const Snapshot *snapshot(const char *path, const KernelDefaultTableClient *table_client); + +/** + * Get the version of the specified snapshot + */ +uint64_t version(const Snapshot *snapshot); + +uintptr_t visit_schema(const Snapshot *snapshot, + const KernelDefaultTableClient *table_client, + struct EngineSchemaVisitor *visitor); + +uintptr_t visit_expression_and(struct KernelExpressionVisitorState *state, + struct EngineIterator *children); + +uintptr_t visit_expression_lt(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_le(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_gt(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_ge(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_eq(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_column(struct KernelExpressionVisitorState *state, const char *name); + +uintptr_t visit_expression_literal_string(struct KernelExpressionVisitorState *state, + const char *value); + +uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *state, int64_t value); + +/** + * Get a FileList for all the files that need to be read from the table. NB: This _consumes_ the + * snapshot, it is no longer valid after making this call (TODO: We should probably fix this?) + * + * # Safety + * + * Caller is responsible to pass a valid snapshot pointer. + */ +struct FileList get_scan_files(const Snapshot *snapshot, + const KernelDefaultTableClient *table_client, + struct EnginePredicate *predicate); diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp new file mode 100644 index 000000000..a8208a639 --- /dev/null +++ b/ffi/kernel.hpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include + +// ====================================================================================== +// Missing forward declarations from deltakernel crate, added manually via cbindgen.toml +// ====================================================================================== +struct Snapshot; +struct TokioBackgroundExecutor; +template struct DefaultTableClient; +// ====================================================================================== + + +struct KernelExpressionVisitorState; + +/// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap +/// the engine functions. The engine retains ownership of the iterator. +struct EngineIterator { + void *data; + /// A function that should advance the iterator and return the next time from the data + /// If the iterator is complete, it should return null. It should be safe to + /// call `get_next()` multiple times if it is null. + const void *(*get_next)(void *data); +}; + +using KernelDefaultTableClient = DefaultTableClient; + +struct EngineSchemaVisitor { + void *data; + uintptr_t (*make_field_list)(void *data, uintptr_t reserve); + void (*visit_struct)(void *data, + uintptr_t sibling_list_id, + const char *name, + uintptr_t child_list_id); + void (*visit_string)(void *data, uintptr_t sibling_list_id, const char *name); + void (*visit_integer)(void *data, uintptr_t sibling_list_id, const char *name); + void (*visit_long)(void *data, uintptr_t sibling_list_id, const char *name); +}; + +struct FileList { + char **files; + int32_t file_count; +}; + +struct EnginePredicate { + void *predicate; + uintptr_t (*visitor)(void *predicate, KernelExpressionVisitorState *state); +}; + +extern "C" { + +/// test function to print for items. this assumes each item is an `int` +void iterate(EngineIterator *it); + +const KernelDefaultTableClient *get_default_client(const char *path); + +/// Get the latest snapshot from the specified table +const Snapshot *snapshot(const char *path, const KernelDefaultTableClient *table_client); + +/// Get the version of the specified snapshot +uint64_t version(const Snapshot *snapshot); + +uintptr_t visit_schema(const Snapshot *snapshot, + const KernelDefaultTableClient *table_client, + EngineSchemaVisitor *visitor); + +uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); + +uintptr_t visit_expression_lt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_le(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_gt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_ge(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_eq(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_column(KernelExpressionVisitorState *state, const char *name); + +uintptr_t visit_expression_literal_string(KernelExpressionVisitorState *state, const char *value); + +uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int64_t value); + +/// Get a FileList for all the files that need to be read from the table. NB: This _consumes_ the +/// snapshot, it is no longer valid after making this call (TODO: We should probably fix this?) +/// +/// # Safety +/// +/// Caller is responsible to pass a valid snapshot pointer. +FileList get_scan_files(const Snapshot *snapshot, + const KernelDefaultTableClient *table_client, + EnginePredicate *predicate); + +} // extern "C" diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 11ceeb76a..94e3d8723 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -1,8 +1,17 @@ /// FFI interface for the delta kernel /// -/// Currently this only has a simple iterator type that a c/c++ engine could pass to kernel. -/// TODO: Add more and update this comment :) -use std::os::raw::c_void; +/// Exposes that an engine needs to call from C/C++ to interface with kernel +use std::collections::HashMap; +use std::ffi::{CStr, CString}; +use std::os::raw::{c_char, c_void}; +use std::path::PathBuf; +use std::sync::Arc; +use url::Url; + +use deltakernel::expressions::{scalars::Scalar, BinaryOperator, Expression}; +use deltakernel::scan::ScanBuilder; +use deltakernel::schema::{DataType, PrimitiveType, StructField, StructType}; +use deltakernel::snapshot::Snapshot; /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap /// the engine functions. The engine retains ownership of the iterator. @@ -40,3 +49,402 @@ impl Iterator for EngineIterator { } } } + +// TODO: We REALLY don't want to expose the real type of default client. Doing so forces every kernel +// API call to take either a default client or a generic client, because those are incompatible types. +use deltakernel::client::executor::tokio::TokioBackgroundExecutor; +type KernelDefaultTableClient = deltakernel::DefaultTableClient; + +/// # Safety +/// +/// Caller is responsible to pass a valid path pointer. +unsafe fn unwrap_and_parse_path_as_url(path: *const c_char) -> Option { + let path = unsafe { CStr::from_ptr(path) }; + let path = path.to_str().unwrap(); + let path = std::fs::canonicalize(PathBuf::from(path)); + let Ok(path) = path else { + println!("Couldn't open table: {}", path.err().unwrap()); + return None; + }; + let Ok(url) = Url::from_directory_path(path) else { + println!("Invalid url"); + return None; + }; + Some(url) +} + +#[no_mangle] +pub unsafe extern "C" fn get_default_client( + path: *const c_char, +) -> *const KernelDefaultTableClient { + let path = unsafe { unwrap_and_parse_path_as_url(path) }; + let Some(url) = path else { + return std::ptr::null_mut(); + }; + let table_client = KernelDefaultTableClient::try_new( + &url, + HashMap::::new(), + Arc::new(TokioBackgroundExecutor::new()), + ); + let Ok(table_client) = table_client else { + println!( + "Error creating table client: {}", + table_client.err().unwrap() + ); + return std::ptr::null_mut(); + }; + Arc::into_raw(Arc::new(table_client)) +} + +/// Get the latest snapshot from the specified table +#[no_mangle] +pub unsafe extern "C" fn snapshot( + path: *const c_char, + table_client: &KernelDefaultTableClient, +) -> *const Snapshot { + let path = unsafe { unwrap_and_parse_path_as_url(path) }; + let Some(url) = path else { + return std::ptr::null_mut(); + }; + let snapshot = Snapshot::try_new(url, table_client, None).unwrap(); + Arc::into_raw(snapshot) +} + +/// Get the version of the specified snapshot +#[no_mangle] +pub extern "C" fn version(snapshot: &Snapshot) -> u64 { + snapshot.version() +} + +// WARNING: the visitor MUST NOT retain internal references to the c_char names passed to visitor methods +// TODO: other types, nullability +#[repr(C)] +pub struct EngineSchemaVisitor { + // opaque state pointer + data: *mut c_void, + // Creates a new field list, optionally reserving capacity up front + make_field_list: extern "C" fn(data: *mut c_void, reserve: usize) -> usize, + // visitor methods that should instantiate and append the appropriate type to the field list + visit_struct: extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + name: *const c_char, + child_list_id: usize, + ) -> (), + visit_string: + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: *const c_char) -> (), + visit_integer: + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: *const c_char) -> (), + visit_long: extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: *const c_char) -> (), +} + +#[no_mangle] +pub extern "C" fn visit_schema( + snapshot: &Snapshot, + table_client: &KernelDefaultTableClient, + visitor: &mut EngineSchemaVisitor, +) -> usize { + // Visit all the fields of a struct and return the list of children + fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { + let child_list_id = (visitor.make_field_list)(visitor.data, s.fields.len()); + for field in s.fields.iter() { + visit_field(visitor, child_list_id, field); + } + child_list_id + } + + // Visit a struct field (recursively) and add the result to the list of siblings. + fn visit_field(visitor: &EngineSchemaVisitor, sibling_list_id: usize, field: &StructField) { + let name = CString::new(field.name.as_bytes()).unwrap(); + match &field.data_type { + DataType::Primitive(PrimitiveType::Integer) => { + (visitor.visit_integer)(visitor.data, sibling_list_id, name.as_ptr()) + } + DataType::Primitive(PrimitiveType::Long) => { + (visitor.visit_long)(visitor.data, sibling_list_id, name.as_ptr()) + } + DataType::Primitive(PrimitiveType::String) => { + (visitor.visit_string)(visitor.data, sibling_list_id, name.as_ptr()) + } + DataType::Struct(s) => { + let child_list_id = visit_struct_fields(visitor, s); + (visitor.visit_struct)(visitor.data, sibling_list_id, name.as_ptr(), child_list_id); + } + other => println!("Unsupported data type: {}", other), + } + } + + let schema: StructType = snapshot.schema(table_client).unwrap(); + visit_struct_fields(visitor, &schema) +} + +// A set that can identify its contents by address +pub struct ReferenceSet { + map: std::collections::HashMap, + next_id: usize, +} + +impl ReferenceSet { + pub fn new() -> Self { + Default::default() + } + + // Inserts a new value into the set. This always creates a new entry + // because the new value cannot have the same address as any existing value. + // Returns a raw pointer to the value. This pointer serves as a key that + // can be used later to take() from the set, and should NOT be dereferenced. + pub fn insert(&mut self, value: T) -> usize { + let id = self.next_id; + self.next_id += 1; + self.map.insert(id, value); + id + } + + // Attempts to remove a value from the set, if present. + pub fn take(&mut self, i: usize) -> Option { + self.map.remove(&i) + } + + // True if the set contains an object whose address matches the pointer. + pub fn contains(&self, id: usize) -> bool { + self.map.contains_key(&id) + } + + // The current size of the set. + pub fn len(&self) -> usize { + self.map.len() + } + + pub fn is_empty(&self) -> bool { + self.map.is_empty() + } +} + +impl Default for ReferenceSet { + fn default() -> Self { + Self { + map: Default::default(), + next_id: 1, + } + } +} + +#[derive(Default)] +pub struct KernelExpressionVisitorState { + // TODO: ReferenceSet> instead? + inflight_expressions: ReferenceSet, +} +impl KernelExpressionVisitorState { + fn new() -> Self { + Self { + inflight_expressions: Default::default(), + } + } +} + +// When invoking [[get_scan_files]], The engine provides a pointer to the (engine's native) +// predicate, along with a visitor function that can be invoked to recursively visit the +// predicate. This engine state is valid until the call to [[get_scan_files]] returns. Inside that +// method, the kernel allocates visitor state, which becomes the second argument to the predicate +// visitor invocation along with the engine-provided predicate pointer. The visitor state is valid +// for the lifetime of the predicate visitor invocation. Thanks to this double indirection, engine +// and kernel each retain ownership of their respective objects, with no need to coordinate memory +// lifetimes with the other. +#[repr(C)] +pub struct EnginePredicate { + predicate: *mut c_void, + visitor: + extern "C" fn(predicate: *mut c_void, state: &mut KernelExpressionVisitorState) -> usize, +} + +fn wrap_expression(state: &mut KernelExpressionVisitorState, expr: Expression) -> usize { + state.inflight_expressions.insert(expr) +} + +fn unwrap_c_string(s: *const c_char) -> String { + let s = unsafe { CStr::from_ptr(s) }; + s.to_str().unwrap().to_string() +} + +fn unwrap_kernel_expression( + state: &mut KernelExpressionVisitorState, + exprid: usize, +) -> Option> { + state.inflight_expressions.take(exprid).map(Box::new) +} + +fn visit_expression_binary( + state: &mut KernelExpressionVisitorState, + op: BinaryOperator, + a: usize, + b: usize, +) -> usize { + let left = unwrap_kernel_expression(state, a); + let right = unwrap_kernel_expression(state, b); + match left.zip(right) { + Some((left, right)) => { + wrap_expression(state, Expression::BinaryOperation { op, left, right }) + } + None => 0, // invalid child => invalid node + } +} + +// The EngineIterator is not thread safe, not reentrant, not owned by callee, not freed by callee. +#[no_mangle] +pub extern "C" fn visit_expression_and( + state: &mut KernelExpressionVisitorState, + children: &mut EngineIterator, +) -> usize { + let mut children = children.flat_map(|child| unwrap_kernel_expression(state, child as usize)); + let left = match children.next() { + Some(left) => left, + _ => return 0, + }; + let right = match children.next() { + Some(right) => right, + _ => return wrap_expression(state, *left), + }; + let mut result = Expression::BinaryOperation { + op: BinaryOperator::And, + left, + right, + }; + for child in children { + let left = Box::new(result); + result = Expression::BinaryOperation { + op: BinaryOperator::And, + left, + right: child, + }; + } + wrap_expression(state, result) +} + +#[no_mangle] +pub extern "C" fn visit_expression_lt( + state: &mut KernelExpressionVisitorState, + a: usize, + b: usize, +) -> usize { + visit_expression_binary(state, BinaryOperator::LessThan, a, b) +} + +#[no_mangle] +pub extern "C" fn visit_expression_le( + state: &mut KernelExpressionVisitorState, + a: usize, + b: usize, +) -> usize { + visit_expression_binary(state, BinaryOperator::LessThanOrEqual, a, b) +} + +#[no_mangle] +pub extern "C" fn visit_expression_gt( + state: &mut KernelExpressionVisitorState, + a: usize, + b: usize, +) -> usize { + visit_expression_binary(state, BinaryOperator::GreaterThan, a, b) +} + +#[no_mangle] +pub extern "C" fn visit_expression_ge( + state: &mut KernelExpressionVisitorState, + a: usize, + b: usize, +) -> usize { + visit_expression_binary(state, BinaryOperator::GreaterThanOrEqual, a, b) +} + +#[no_mangle] +pub extern "C" fn visit_expression_eq( + state: &mut KernelExpressionVisitorState, + a: usize, + b: usize, +) -> usize { + visit_expression_binary(state, BinaryOperator::Equal, a, b) +} + +#[no_mangle] +pub extern "C" fn visit_expression_column( + state: &mut KernelExpressionVisitorState, + name: *const c_char, +) -> usize { + wrap_expression(state, Expression::Column(unwrap_c_string(name))) +} + +#[no_mangle] +pub extern "C" fn visit_expression_literal_string( + state: &mut KernelExpressionVisitorState, + value: *const c_char, +) -> usize { + wrap_expression( + state, + Expression::Literal(Scalar::from(unwrap_c_string(value))), + ) +} + +#[no_mangle] +pub extern "C" fn visit_expression_literal_long( + state: &mut KernelExpressionVisitorState, + value: i64, +) -> usize { + wrap_expression(state, Expression::Literal(Scalar::from(value))) +} + +#[repr(C)] +pub struct FileList { + files: *mut *mut c_char, + file_count: i32, +} + +/// Get a FileList for all the files that need to be read from the table. NB: This _consumes_ the +/// snapshot, it is no longer valid after making this call (TODO: We should probably fix this?) +/// +/// # Safety +/// +/// Caller is responsible to pass a valid snapshot pointer. +#[no_mangle] +pub unsafe extern "C" fn get_scan_files( + snapshot: *const Snapshot, + table_client: *const KernelDefaultTableClient, + predicate: Option<&mut EnginePredicate>, +) -> FileList { + let snapshot: Arc = unsafe { Arc::from_raw(snapshot) }; + let table_client = unsafe { Arc::from_raw(table_client) }; + let mut scan_builder = ScanBuilder::new(snapshot); + if let Some(predicate) = predicate { + // TODO: There is a lot of redundancy between the various visit_expression_XXX methods here, + // vs. ProvidesMetadataFilter trait and the class hierarchy that supports it. Can we justify + // combining the two, so that native rust kernel code also uses the visitor idiom? Doing so + // might mean kernel no longer needs to define an expression class hierarchy of its own (at + // least, not for data skipping). Things may also look different after we remove arrow code + // from the kernel proper and make it one of the sensible default engine clients instead. + let mut visitor_state = KernelExpressionVisitorState::new(); + let exprid = (predicate.visitor)(predicate.predicate, &mut visitor_state); + if let Some(predicate) = unwrap_kernel_expression(&mut visitor_state, exprid) { + println!("Got predicate: {}", predicate); + scan_builder = scan_builder.with_predicate(*predicate); + } + } + let scan_adds = scan_builder + .build(table_client.as_ref()) + .unwrap() + .files(table_client.as_ref()) + .unwrap(); + let mut file_count = 0; + let mut files: Vec<*mut i8> = scan_adds + .into_iter() + .map(|add| { + file_count += 1; + CString::new(add.unwrap().path).unwrap().into_raw() + }) + .collect(); + let ptr = files.as_mut_ptr(); + std::mem::forget(files); + println!("{} files survived pruning", file_count); + FileList { + files: ptr, + file_count, + } +} From 7802726a44e53b72b157062a5d6982265c044b7a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 27 Jan 2024 14:08:37 -0800 Subject: [PATCH 02/27] simplify AND/OR logic --- kernel/src/client/expression.rs | 10 +++---- kernel/src/expressions.rs | 49 ++++++++++++++------------------- 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/kernel/src/client/expression.rs b/kernel/src/client/expression.rs index e16a7c93a..854902573 100644 --- a/kernel/src/client/expression.rs +++ b/kernel/src/client/expression.rs @@ -121,9 +121,10 @@ fn evaluate_expression(expression: &Expression, batch: &RecordBatch) -> DeltaRes }) } VariadicOperation { op, exprs } => { - let reducer = match op { - VariadicOperator::And => and, - VariadicOperator::Or => or, + type Operation = fn(&BooleanArray, &BooleanArray) -> Result; + let (reducer, default): (Operation, _) = match op { + VariadicOperator::And => (and, true), + VariadicOperator::Or => (or, false), }; exprs .iter() @@ -132,8 +133,7 @@ fn evaluate_expression(expression: &Expression, batch: &RecordBatch) -> DeltaRes Ok(reducer(downcast_to_bool(&l?)?, downcast_to_bool(&r?)?) .map(wrap_comparison_result)?) }) - .transpose()? - .ok_or(Error::Generic("empty expression".to_string())) + .unwrap_or_else(|| evaluate_expression(&Expression::literal(default), batch)) } } } diff --git a/kernel/src/expressions.rs b/kernel/src/expressions.rs index d933a900f..87b506893 100644 --- a/kernel/src/expressions.rs +++ b/kernel/src/expressions.rs @@ -156,6 +156,19 @@ impl Expression { Self::Literal(value.into()) } + pub fn and_from(exprs: impl IntoIterator) -> Self { + Self::variadic_op_from(exprs, VariadicOperator::And) + } + + pub fn or_from(exprs: impl IntoIterator) -> Self { + Self::variadic_op_from(exprs, VariadicOperator::Or) + } + + fn variadic_op_from(exprs: impl IntoIterator, op: VariadicOperator) -> Self { + let exprs = exprs.into_iter().collect::>(); + Self::VariadicOperation {op, exprs} + } + fn binary_op_impl(self, other: Self, op: BinaryOperator) -> Self { Self::BinaryOperation { op, @@ -164,15 +177,6 @@ impl Expression { } } - fn variadic_op_impl(self, other: impl IntoIterator, op: VariadicOperator) -> Self { - let mut exprs = other.into_iter().collect::>(); - if exprs.is_empty() { - return self; - } - exprs.insert(0, self); - Self::VariadicOperation { op, exprs } - } - /// Create a new expression `self == other` pub fn eq(self, other: Self) -> Self { self.binary_op_impl(other, BinaryOperator::Equal) @@ -205,22 +209,12 @@ impl Expression { /// Create a new expression `self AND other` pub fn and(self, other: Self) -> Self { - self.and_many([other]) - } - - /// Create a new expression `self AND others` - pub fn and_many(self, other: impl IntoIterator) -> Self { - self.variadic_op_impl(other, VariadicOperator::And) - } - - /// Create a new expression `self AND other` - pub fn or(self, other: Self) -> Self { - self.or_many([other]) + Self::and_from([self, other]) } /// Create a new expression `self OR other` - pub fn or_many(self, other: impl IntoIterator) -> Self { - self.variadic_op_impl(other, VariadicOperator::Or) + pub fn or(self, other: Self) -> Self { + Self::or_from([self, other]) } fn walk(&self) -> impl Iterator + '_ { @@ -237,11 +231,9 @@ impl Expression { Self::UnaryOperation { expr, .. } => { stack.push(expr); } - Self::VariadicOperation { op, exprs } => match op { - VariadicOperator::And | VariadicOperator::Or => { - stack.extend(exprs.iter()); - } - }, + Self::VariadicOperation { exprs, .. } => { + stack.extend(exprs.iter()); + } } Some(expr) }) @@ -306,7 +298,8 @@ mod tests { "AND(Column(x) >= 2, Column(x) <= 10)", ), ( - col_ref.clone().gt_eq(Expr::literal(2)).and_many([ + Expr::and_from([ + col_ref.clone().gt_eq(Expr::literal(2)), col_ref.clone().lt_eq(Expr::literal(10)), col_ref.clone().lt_eq(Expr::literal(100)), ]), From 8be8a37f44419ae319bf26321ce40a2a099adc85 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 27 Jan 2024 14:16:03 -0800 Subject: [PATCH 03/27] Fix FFI to match latest kernel code --- ffi/kernel.h | 4 +--- ffi/kernel.hpp | 4 +--- ffi/src/lib.rs | 42 ++++++++++-------------------------------- 3 files changed, 12 insertions(+), 38 deletions(-) diff --git a/ffi/kernel.h b/ffi/kernel.h index d6461fac3..ff63e0378 100644 --- a/ffi/kernel.h +++ b/ffi/kernel.h @@ -69,9 +69,7 @@ const Snapshot *snapshot(const char *path, const KernelDefaultTableClient *table */ uint64_t version(const Snapshot *snapshot); -uintptr_t visit_schema(const Snapshot *snapshot, - const KernelDefaultTableClient *table_client, - struct EngineSchemaVisitor *visitor); +uintptr_t visit_schema(const Snapshot *snapshot, struct EngineSchemaVisitor *visitor); uintptr_t visit_expression_and(struct KernelExpressionVisitorState *state, struct EngineIterator *children); diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp index a8208a639..ea0257876 100644 --- a/ffi/kernel.hpp +++ b/ffi/kernel.hpp @@ -62,9 +62,7 @@ const Snapshot *snapshot(const char *path, const KernelDefaultTableClient *table /// Get the version of the specified snapshot uint64_t version(const Snapshot *snapshot); -uintptr_t visit_schema(const Snapshot *snapshot, - const KernelDefaultTableClient *table_client, - EngineSchemaVisitor *visitor); +uintptr_t visit_schema(const Snapshot *snapshot, EngineSchemaVisitor *visitor); uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 94e3d8723..26eebb28c 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -141,7 +141,6 @@ pub struct EngineSchemaVisitor { #[no_mangle] pub extern "C" fn visit_schema( snapshot: &Snapshot, - table_client: &KernelDefaultTableClient, visitor: &mut EngineSchemaVisitor, ) -> usize { // Visit all the fields of a struct and return the list of children @@ -174,8 +173,7 @@ pub extern "C" fn visit_schema( } } - let schema: StructType = snapshot.schema(table_client).unwrap(); - visit_struct_fields(visitor, &schema) + visit_struct_fields(visitor, &snapshot.schema()) } // A set that can identify its contents by address @@ -269,8 +267,8 @@ fn unwrap_c_string(s: *const c_char) -> String { fn unwrap_kernel_expression( state: &mut KernelExpressionVisitorState, exprid: usize, -) -> Option> { - state.inflight_expressions.take(exprid).map(Box::new) +) -> Option { + state.inflight_expressions.take(exprid) } fn visit_expression_binary( @@ -279,8 +277,8 @@ fn visit_expression_binary( a: usize, b: usize, ) -> usize { - let left = unwrap_kernel_expression(state, a); - let right = unwrap_kernel_expression(state, b); + let left = unwrap_kernel_expression(state, a).map(Box::new); + let right = unwrap_kernel_expression(state, b).map(Box::new); match left.zip(right) { Some((left, right)) => { wrap_expression(state, Expression::BinaryOperation { op, left, right }) @@ -295,28 +293,9 @@ pub extern "C" fn visit_expression_and( state: &mut KernelExpressionVisitorState, children: &mut EngineIterator, ) -> usize { - let mut children = children.flat_map(|child| unwrap_kernel_expression(state, child as usize)); - let left = match children.next() { - Some(left) => left, - _ => return 0, - }; - let right = match children.next() { - Some(right) => right, - _ => return wrap_expression(state, *left), - }; - let mut result = Expression::BinaryOperation { - op: BinaryOperator::And, - left, - right, - }; - for child in children { - let left = Box::new(result); - result = Expression::BinaryOperation { - op: BinaryOperator::And, - left, - right: child, - }; - } + let result = Expression::and_from( + children.flat_map(|child| unwrap_kernel_expression(state, child as usize)) + ); wrap_expression(state, result) } @@ -424,12 +403,11 @@ pub unsafe extern "C" fn get_scan_files( let exprid = (predicate.visitor)(predicate.predicate, &mut visitor_state); if let Some(predicate) = unwrap_kernel_expression(&mut visitor_state, exprid) { println!("Got predicate: {}", predicate); - scan_builder = scan_builder.with_predicate(*predicate); + scan_builder = scan_builder.with_predicate(predicate); } } let scan_adds = scan_builder - .build(table_client.as_ref()) - .unwrap() + .build() .files(table_client.as_ref()) .unwrap(); let mut file_count = 0; From ff68f727ee0d23db9cf1de61f034146384b0d395 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 27 Jan 2024 14:03:30 -0800 Subject: [PATCH 04/27] scan files is now an iterator --- ffi/kernel.h | 21 +++++++++------ ffi/kernel.hpp | 21 +++++++++------ ffi/src/lib.rs | 73 +++++++++++++++++++++++++++++++++----------------- 3 files changed, 75 insertions(+), 40 deletions(-) diff --git a/ffi/kernel.h b/ffi/kernel.h index ff63e0378..f232fe19c 100644 --- a/ffi/kernel.h +++ b/ffi/kernel.h @@ -14,6 +14,8 @@ template struct DefaultTableClient; typedef struct KernelExpressionVisitorState KernelExpressionVisitorState; +typedef struct KernelScanFileIterator KernelScanFileIterator; + /** * Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap * the engine functions. The engine retains ownership of the iterator. @@ -42,11 +44,6 @@ typedef struct EngineSchemaVisitor { void (*visit_long)(void *data, uintptr_t sibling_list_id, const char *name); } EngineSchemaVisitor; -typedef struct FileList { - char **files; - int32_t file_count; -} FileList; - typedef struct EnginePredicate { void *predicate; uintptr_t (*visitor)(void *predicate, struct KernelExpressionVisitorState *state); @@ -99,6 +96,14 @@ uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *sta * * Caller is responsible to pass a valid snapshot pointer. */ -struct FileList get_scan_files(const Snapshot *snapshot, - const KernelDefaultTableClient *table_client, - struct EnginePredicate *predicate); +struct KernelScanFileIterator *kernel_scan_files_init(const Snapshot *snapshot, + const KernelDefaultTableClient *table_client, + struct EnginePredicate *predicate); + +void kernel_scan_files_next(struct KernelScanFileIterator *files, + void *engine_context, + void (*engine_visitor)(void *engine_context, + const char *ptr, + uintptr_t len)); + +void kernel_scan_files_free(struct KernelScanFileIterator *files); diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp index ea0257876..00b6cbcc0 100644 --- a/ffi/kernel.hpp +++ b/ffi/kernel.hpp @@ -15,6 +15,8 @@ template struct DefaultTableClient; struct KernelExpressionVisitorState; +struct KernelScanFileIterator; + /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap /// the engine functions. The engine retains ownership of the iterator. struct EngineIterator { @@ -39,11 +41,6 @@ struct EngineSchemaVisitor { void (*visit_long)(void *data, uintptr_t sibling_list_id, const char *name); }; -struct FileList { - char **files; - int32_t file_count; -}; - struct EnginePredicate { void *predicate; uintptr_t (*visitor)(void *predicate, KernelExpressionVisitorState *state); @@ -88,8 +85,16 @@ uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int /// # Safety /// /// Caller is responsible to pass a valid snapshot pointer. -FileList get_scan_files(const Snapshot *snapshot, - const KernelDefaultTableClient *table_client, - EnginePredicate *predicate); +KernelScanFileIterator *kernel_scan_files_init(const Snapshot *snapshot, + const KernelDefaultTableClient *table_client, + EnginePredicate *predicate); + +void kernel_scan_files_next(KernelScanFileIterator *files, + void *engine_context, + void (*engine_visitor)(void *engine_context, + const char *ptr, + uintptr_t len)); + +void kernel_scan_files_free(KernelScanFileIterator *files); } // extern "C" diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 26eebb28c..b94e90857 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::ffi::{CStr, CString}; use std::os::raw::{c_char, c_void}; use std::path::PathBuf; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use url::Url; use deltakernel::expressions::{scalars::Scalar, BinaryOperator, Expression}; @@ -173,7 +173,7 @@ pub extern "C" fn visit_schema( } } - visit_struct_fields(visitor, &snapshot.schema()) + visit_struct_fields(visitor, snapshot.schema()) } // A set that can identify its contents by address @@ -371,10 +371,25 @@ pub extern "C" fn visit_expression_literal_long( wrap_expression(state, Expression::Literal(Scalar::from(value))) } -#[repr(C)] -pub struct FileList { - files: *mut *mut c_char, - file_count: i32, +pub struct KernelScanFileIterator { + // Box -> Wrap its unsized content this struct is fixed-size with thin pointers. + // Mutex -> We need to protect the iterator against multi-threaded engine access. + // Item = String -> Owned items because rust can't correctly express lifetimes for borrowed items + // (we would need a way to assert that item lifetimes are bounded by the iterator's lifetime). + files: Box>>, + + // The iterator has an internal borrowed reference to the Snapshot it came from. Rust can't + // track that across the FFI boundary, so it's up to us to keep the Snapshot alive. + _snapshot: Arc, + + // The iterator also has an internal borrowed reference to the table client it came from. + _table_client: Arc, +} + +impl Drop for KernelScanFileIterator { + fn drop(&mut self) { + println!("dropping KernelScanFileIterator"); + } } /// Get a FileList for all the files that need to be read from the table. NB: This _consumes_ the @@ -384,14 +399,14 @@ pub struct FileList { /// /// Caller is responsible to pass a valid snapshot pointer. #[no_mangle] -pub unsafe extern "C" fn get_scan_files( +pub extern "C" fn kernel_scan_files_init( snapshot: *const Snapshot, table_client: *const KernelDefaultTableClient, predicate: Option<&mut EnginePredicate>, -) -> FileList { - let snapshot: Arc = unsafe { Arc::from_raw(snapshot) }; +) -> *mut KernelScanFileIterator { + let snapshot: Arc = unsafe { Arc::from_raw(snapshot.clone()) }; let table_client = unsafe { Arc::from_raw(table_client) }; - let mut scan_builder = ScanBuilder::new(snapshot); + let mut scan_builder = ScanBuilder::new(snapshot.clone()); if let Some(predicate) = predicate { // TODO: There is a lot of redundancy between the various visit_expression_XXX methods here, // vs. ProvidesMetadataFilter trait and the class hierarchy that supports it. Can we justify @@ -410,19 +425,29 @@ pub unsafe extern "C" fn get_scan_files( .build() .files(table_client.as_ref()) .unwrap(); - let mut file_count = 0; - let mut files: Vec<*mut i8> = scan_adds - .into_iter() - .map(|add| { - file_count += 1; - CString::new(add.unwrap().path).unwrap().into_raw() - }) - .collect(); - let ptr = files.as_mut_ptr(); - std::mem::forget(files); - println!("{} files survived pruning", file_count); - FileList { - files: ptr, - file_count, + let files = scan_adds.map(|add| add.unwrap().path); + let files = KernelScanFileIterator { + files: Box::new(Mutex::new(files)), + _snapshot: snapshot, + _table_client: table_client, + }; + Box::into_raw(Box::new(files)) +} + +#[no_mangle] +pub extern "C" fn kernel_scan_files_next( + files: &mut KernelScanFileIterator, + engine_context: *mut c_void, + engine_visitor: extern "C" fn(engine_context: *mut c_void, ptr: *const c_char, len: usize), +) { + let mut files = files.files.lock().unwrap(); + if let Some(file) = files.next() { + println!("Got file: {}", file); + (engine_visitor)(engine_context, file.as_ptr().cast(), file.len()); } } + +#[no_mangle] +pub extern "C" fn kernel_scan_files_free(files: *mut KernelScanFileIterator) { + let _ = unsafe { Box::from_raw(files) }; +} From d04a7372777ec52e32f6c20bd36260fc98a1ce86 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 29 Jan 2024 08:03:21 -0800 Subject: [PATCH 05/27] clippy/fmt --- ffi/src/lib.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 7e0945904..895ab22a3 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -8,7 +8,7 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex}; use url::Url; -use deltakernel::expressions::{Scalar, BinaryOperator, Expression}; +use deltakernel::expressions::{BinaryOperator, Expression, Scalar}; use deltakernel::scan::ScanBuilder; use deltakernel::schema::{DataType, PrimitiveType, StructField, StructType}; use deltakernel::snapshot::Snapshot; @@ -139,10 +139,7 @@ pub struct EngineSchemaVisitor { } #[no_mangle] -pub extern "C" fn visit_schema( - snapshot: &Snapshot, - visitor: &mut EngineSchemaVisitor, -) -> usize { +pub extern "C" fn visit_schema(snapshot: &Snapshot, visitor: &mut EngineSchemaVisitor) -> usize { // Visit all the fields of a struct and return the list of children fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, s.fields.len()); @@ -294,7 +291,7 @@ pub extern "C" fn visit_expression_and( children: &mut EngineIterator, ) -> usize { let result = Expression::and_from( - children.flat_map(|child| unwrap_kernel_expression(state, child as usize)) + children.flat_map(|child| unwrap_kernel_expression(state, child as usize)), ); wrap_expression(state, result) } @@ -404,7 +401,7 @@ pub extern "C" fn kernel_scan_files_init( table_client: *const KernelDefaultTableClient, predicate: Option<&mut EnginePredicate>, ) -> *mut KernelScanFileIterator { - let snapshot: Arc = unsafe { Arc::from_raw(snapshot.clone()) }; + let snapshot: Arc = unsafe { Arc::from_raw(snapshot) }; let table_client = unsafe { Arc::from_raw(table_client) }; let mut scan_builder = ScanBuilder::new(snapshot.clone()); if let Some(predicate) = predicate { @@ -421,10 +418,7 @@ pub extern "C" fn kernel_scan_files_init( scan_builder = scan_builder.with_predicate(predicate); } } - let scan_adds = scan_builder - .build() - .files(table_client.as_ref()) - .unwrap(); + let scan_adds = scan_builder.build().files(table_client.as_ref()).unwrap(); let files = scan_adds.map(|add| add.unwrap().path); let files = KernelScanFileIterator { files: Box::new(Mutex::new(files)), From 4623c5142ace69aed4127e528d46047ae30c76f8 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 29 Jan 2024 09:40:36 -0800 Subject: [PATCH 06/27] cargo fmt again... this time with --all option --- kernel/src/scan/data_skipping.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index f8dff1fda..2e3c6b25e 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -64,14 +64,8 @@ fn as_data_skipping_predicate(expr: &Expr) -> Option { } NotEqual => { let exprs = [ - Expr::gt( - Column(format!("minValues.{}", col)), - Literal(val.clone()), - ), - Expr::lt( - Column(format!("maxValues.{}", col)), - Literal(val.clone()), - ), + Expr::gt(Column(format!("minValues.{}", col)), Literal(val.clone())), + Expr::lt(Column(format!("maxValues.{}", col)), Literal(val.clone())), ]; return Some(Expr::or_from(exprs)); } From b0b1f62c1b720d8c4202547fa0713c06de7e25f0 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 20 Feb 2024 16:15:26 -0800 Subject: [PATCH 07/27] start supporting basic error handling and harmonize trait passing --- ffi/cbindgen.toml | 10 -- ffi/kernel.h | 132 ++++++++++++++++--- ffi/kernel.hpp | 104 ++++++++++++--- ffi/src/handle.rs | 202 ++++++++++++++++++++++++++++ ffi/src/lib.rs | 276 ++++++++++++++++++++++++++++++++------- kernel/src/client/mod.rs | 2 - 6 files changed, 629 insertions(+), 97 deletions(-) create mode 100644 ffi/src/handle.rs diff --git a/ffi/cbindgen.toml b/ffi/cbindgen.toml index 634a95a1f..e69de29bb 100644 --- a/ffi/cbindgen.toml +++ b/ffi/cbindgen.toml @@ -1,10 +0,0 @@ -after_includes = """ - -// ====================================================================================== -// Missing forward declarations from deltakernel crate, added manually via cbindgen.toml -// ====================================================================================== -struct Snapshot; -struct TokioBackgroundExecutor; -template struct DefaultTableClient; -// ====================================================================================== -""" diff --git a/ffi/kernel.h b/ffi/kernel.h index f232fe19c..52bf35484 100644 --- a/ffi/kernel.h +++ b/ffi/kernel.h @@ -3,19 +3,32 @@ #include #include -// ====================================================================================== -// Missing forward declarations from deltakernel crate, added manually via cbindgen.toml -// ====================================================================================== -struct Snapshot; -struct TokioBackgroundExecutor; -template struct DefaultTableClient; -// ====================================================================================== - +typedef enum KernelError { + UnknownError, + FFIError, + ArrowError, + GenericError, + ParquetError, + ObjectStoreError, + FileNotFoundError, + MissingColumnError, + UnexpectedColumnTypeError, + MissingDataError, + MissingVersionError, + DeletionVectorError, + InvalidUrlError, + MalformedJsonError, + MissingMetadataError, +} KernelError; + +typedef struct ExternTableClientHandle ExternTableClientHandle; typedef struct KernelExpressionVisitorState KernelExpressionVisitorState; typedef struct KernelScanFileIterator KernelScanFileIterator; +typedef struct SnapshotHandle SnapshotHandle; + /** * Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap * the engine functions. The engine retains ownership of the iterator. @@ -30,7 +43,51 @@ typedef struct EngineIterator { const void *(*get_next)(void *data); } EngineIterator; -typedef DefaultTableClient KernelDefaultTableClient; +/** + * An error that can be returned to the engine. Engines can define additional struct fields on + * their side, by e.g. embedding this struct as the first member of a larger struct. + */ +typedef struct EngineError { + enum KernelError etype; +} EngineError; + +typedef enum ExternResult______ExternTableClientHandle_Tag { + Ok______ExternTableClientHandle, + Err______ExternTableClientHandle, +} ExternResult______ExternTableClientHandle_Tag; + +typedef struct ExternResult______ExternTableClientHandle { + ExternResult______ExternTableClientHandle_Tag tag; + union { + struct { + const struct ExternTableClientHandle *ok; + }; + struct { + struct EngineError *err; + }; + }; +} ExternResult______ExternTableClientHandle; + +typedef struct EngineError *(*AllocateErrorFn)(enum KernelError etype, + const char *msg_ptr, + uintptr_t msg_len); + +typedef enum ExternResult______SnapshotHandle_Tag { + Ok______SnapshotHandle, + Err______SnapshotHandle, +} ExternResult______SnapshotHandle_Tag; + +typedef struct ExternResult______SnapshotHandle { + ExternResult______SnapshotHandle_Tag tag; + union { + struct { + const struct SnapshotHandle *ok; + }; + struct { + struct EngineError *err; + }; + }; +} ExternResult______SnapshotHandle; typedef struct EngineSchemaVisitor { void *data; @@ -54,19 +111,54 @@ typedef struct EnginePredicate { */ void iterate(struct EngineIterator *it); -const KernelDefaultTableClient *get_default_client(const char *path); +/** + * # Safety + * + * Caller is responsible to pass a valid path pointer. + */ +struct ExternResult______ExternTableClientHandle get_default_client(const char *path, + AllocateErrorFn allocate_error); + +/** + * # Safety + * + * Caller is responsible to pass a valid handle. + */ +void drop_table_client(const struct ExternTableClientHandle *table_client); /** * Get the latest snapshot from the specified table + * + * # Safety + * + * Caller is responsible to pass valid handles and path pointer. */ -const Snapshot *snapshot(const char *path, const KernelDefaultTableClient *table_client); +struct ExternResult______SnapshotHandle snapshot(const char *path, + const struct ExternTableClientHandle *table_client, + AllocateErrorFn allocate_error); + +/** + * # Safety + * + * Caller is responsible to pass a valid handle. + */ +void drop_snapshot(const struct SnapshotHandle *snapshot); /** * Get the version of the specified snapshot + * + * # Safety + * + * Caller is responsible to pass a valid handle. */ -uint64_t version(const Snapshot *snapshot); +uint64_t version(const struct SnapshotHandle *snapshot); -uintptr_t visit_schema(const Snapshot *snapshot, struct EngineSchemaVisitor *visitor); +/** + * # Safety + * + * Caller is responsible to pass a valid handle. + */ +uintptr_t visit_schema(const struct SnapshotHandle *snapshot, struct EngineSchemaVisitor *visitor); uintptr_t visit_expression_and(struct KernelExpressionVisitorState *state, struct EngineIterator *children); @@ -89,15 +181,13 @@ uintptr_t visit_expression_literal_string(struct KernelExpressionVisitorState *s uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *state, int64_t value); /** - * Get a FileList for all the files that need to be read from the table. NB: This _consumes_ the - * snapshot, it is no longer valid after making this call (TODO: We should probably fix this?) - * + * Get a FileList for all the files that need to be read from the table. * # Safety * * Caller is responsible to pass a valid snapshot pointer. */ -struct KernelScanFileIterator *kernel_scan_files_init(const Snapshot *snapshot, - const KernelDefaultTableClient *table_client, +struct KernelScanFileIterator *kernel_scan_files_init(const struct SnapshotHandle *snapshot, + const struct ExternTableClientHandle *table_client, struct EnginePredicate *predicate); void kernel_scan_files_next(struct KernelScanFileIterator *files, @@ -106,4 +196,10 @@ void kernel_scan_files_next(struct KernelScanFileIterator *files, const char *ptr, uintptr_t len)); +/** + * # Safety + * + * Caller is responsible to (at most once) pass a valid pointer returned by a call to + * [kernel_scan_files_init]. + */ void kernel_scan_files_free(struct KernelScanFileIterator *files); diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp index 00b6cbcc0..d86043d7a 100644 --- a/ffi/kernel.hpp +++ b/ffi/kernel.hpp @@ -4,19 +4,32 @@ #include #include -// ====================================================================================== -// Missing forward declarations from deltakernel crate, added manually via cbindgen.toml -// ====================================================================================== -struct Snapshot; -struct TokioBackgroundExecutor; -template struct DefaultTableClient; -// ====================================================================================== +enum class KernelError { + UnknownError, + FFIError, + ArrowError, + GenericError, + ParquetError, + ObjectStoreError, + FileNotFoundError, + MissingColumnError, + UnexpectedColumnTypeError, + MissingDataError, + MissingVersionError, + DeletionVectorError, + InvalidUrlError, + MalformedJsonError, + MissingMetadataError, +}; +struct ExternTableClientHandle; struct KernelExpressionVisitorState; struct KernelScanFileIterator; +struct SnapshotHandle; + /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap /// the engine functions. The engine retains ownership of the iterator. struct EngineIterator { @@ -27,7 +40,35 @@ struct EngineIterator { const void *(*get_next)(void *data); }; -using KernelDefaultTableClient = DefaultTableClient; +/// An error that can be returned to the engine. Engines can define additional struct fields on +/// their side, by e.g. embedding this struct as the first member of a larger struct. +struct EngineError { + KernelError etype; +}; + +template +struct ExternResult { + enum class Tag { + Ok, + Err, + }; + + struct Ok_Body { + T _0; + }; + + struct Err_Body { + EngineError *_0; + }; + + Tag tag; + union { + Ok_Body ok; + Err_Body err; + }; +}; + +using AllocateErrorFn = EngineError*(*)(KernelError etype, const char *msg_ptr, uintptr_t msg_len); struct EngineSchemaVisitor { void *data; @@ -51,15 +92,42 @@ extern "C" { /// test function to print for items. this assumes each item is an `int` void iterate(EngineIterator *it); -const KernelDefaultTableClient *get_default_client(const char *path); +/// # Safety +/// +/// Caller is responsible to pass a valid path pointer. +ExternResult get_default_client(const char *path, + AllocateErrorFn allocate_error); + +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +void drop_table_client(const ExternTableClientHandle *table_client); /// Get the latest snapshot from the specified table -const Snapshot *snapshot(const char *path, const KernelDefaultTableClient *table_client); +/// +/// # Safety +/// +/// Caller is responsible to pass valid handles and path pointer. +ExternResult snapshot(const char *path, + const ExternTableClientHandle *table_client, + AllocateErrorFn allocate_error); + +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +void drop_snapshot(const SnapshotHandle *snapshot); /// Get the version of the specified snapshot -uint64_t version(const Snapshot *snapshot); +/// +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +uint64_t version(const SnapshotHandle *snapshot); -uintptr_t visit_schema(const Snapshot *snapshot, EngineSchemaVisitor *visitor); +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +uintptr_t visit_schema(const SnapshotHandle *snapshot, EngineSchemaVisitor *visitor); uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); @@ -79,14 +147,12 @@ uintptr_t visit_expression_literal_string(KernelExpressionVisitorState *state, c uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int64_t value); -/// Get a FileList for all the files that need to be read from the table. NB: This _consumes_ the -/// snapshot, it is no longer valid after making this call (TODO: We should probably fix this?) -/// +/// Get a FileList for all the files that need to be read from the table. /// # Safety /// /// Caller is responsible to pass a valid snapshot pointer. -KernelScanFileIterator *kernel_scan_files_init(const Snapshot *snapshot, - const KernelDefaultTableClient *table_client, +KernelScanFileIterator *kernel_scan_files_init(const SnapshotHandle *snapshot, + const ExternTableClientHandle *table_client, EnginePredicate *predicate); void kernel_scan_files_next(KernelScanFileIterator *files, @@ -95,6 +161,10 @@ void kernel_scan_files_next(KernelScanFileIterator *files, const char *ptr, uintptr_t len)); +/// # Safety +/// +/// Caller is responsible to (at most once) pass a valid pointer returned by a call to +/// [kernel_scan_files_init]. void kernel_scan_files_free(KernelScanFileIterator *files); } // extern "C" diff --git a/ffi/src/handle.rs b/ffi/src/handle.rs new file mode 100644 index 000000000..9573d5ed7 --- /dev/null +++ b/ffi/src/handle.rs @@ -0,0 +1,202 @@ +use std::sync::Arc; + +mod uncreate { + /// A struct that cannot be instantiated by any rust code because this modeule exposes no public + /// constructor for it. + pub struct Uncreate { + _private: usize + } +} + +// Make it easy to use (the module was only used to enforce member privacy). +pub type Uncreate = uncreate::Uncreate; + +/// Helper trait that allows passing `Arc` across the FFI boundary as a thin-pointer handle +/// type. The pointer remains valid until freed by a call to [ArcHandle::drop_handle]. The handle is +/// strongly typed, in the sense that each handle type maps to a single `Target` type. +/// +/// Typically, the handle (`Self`) is an opaque struct (_not_ `repr(C)`) with an FFI-friendly name +/// name, containing an [Uncreate] member that prevents rust code from legally instantiating it. +/// +/// # Examples +/// +/// To define and use a handle for a trait or other unsized type: +/// ``` +/// // The (unsized) trait to pass across the FFI boundary +/// trait MyTrait { ... } +/// +/// // The handle that will represent `MyStruct` externally +/// struct MyTraitHandle { +/// _uncreate: Uncreate, +/// } +/// +/// // Connect the two +/// impl ArcHandle for MyTraitHandle { +/// type Target = dyn MyTrait; +/// } +/// +/// fn unsized_handle_example(val: Arc) { +/// let handle: *const MyTraitHandle = ArcHandle::into_handle(val); +/// let val2 = unsafe { ArcHandle::clone_as_arc(handle) }; +/// unsafe { ArcHandle::drop_handle(handle) }; +/// // `handle` is no longer safe to use, but `val2` (a cloned Arc) is still valid +/// } +/// ``` +/// +/// To define and use a handle optimized for a sized type, just impl [SizedArcHandle] (which +/// automatically impl [ArcHandle]), and use the [ArcHandle] API in the same way as for unsized: +/// +/// ``` +/// // The (sized) struct to pass across the FFI boundary struct +/// MyStruct { ... } +/// +/// // The handle that will represent `MyStruct` externally +/// struct MyStructHandle { +/// _uncreate: Uncreate, +/// } +/// +/// // Connect the two +/// impl SizedArcHandle for MyStructHandle { +/// type Target = MyStruct; +/// } +/// +/// fn sized_handle_example(val: Arc) { +/// let handle: *const MyStructHandle = ArcHandle::into_handle(val); +/// let val2 = unsafe { ArcHandle::clone_as_arc(handle) }; +/// unsafe { ArcHandle::drop_handle(handle) }; +/// // `handle` is no longer safe to use, but `val2` (a cloned Arc) is still valid +/// } +/// ``` +/// +/// # Safety +/// +/// In addition to being a raw pointer, a handle always points to an underlying allocation of some +/// other type. Thus, it is _ALWAYS_ incorrect to dereference (`*`) a handle or cast it to any other +/// type, (including `Target`). The only safe operations on a handle are to copy the pointer itself +/// by value and to invoke associated functions of `ArcHandle`. +/// +/// Additionally, this trait intentionally does _NOT_ expose anything like an `as_ref()` function, +/// because the resulting reference would have an arbitrary (= unnecessarily dangerous) lifetime +/// that cannot be safely enforced, even if we aggressively bound its lifetime. For example: +/// ``` +/// let handle: *const MyHandle = ArcHandle::into_handle(...); +/// let handle_copy = handle; // does NOT increment the Arc refcount +/// let my_ref = ArcHandle::as_ref(&handle); +/// ArcHandle::drop_handle(handle_copy); // still-borrowed `handle` can't prevent this +/// my_ref.some_access(); // Illegal access to dangling reference +/// ``` +/// +/// If a borrowed reference is needed, call [ArcHandle::clone_as_arc] and invoke [Arc::as_ref] on +/// the result. +/// ``` +/// let handle: *const MyHandle = ArcHandle::into_handle(...); +/// let handle_copy = handle; // does NOT increment the Arc refcount +/// let my_ref = ArcHandle::clone_as_arc(handle); // increments refcount +/// ArcHandle::drop_handle(handle); +/// my_ref.some_access(); // No problem. +/// ``` +/// +/// # Synchronization +/// +/// The handle (a raw pointer) is neither [Send] nor [Sync] by default: +/// +/// It is ONLY safe to manually implement [Send] for a handle (or a type that embeds it) if all of +/// the following conditions hold: +/// +/// * The handle's `Target` type is `Send` +/// +/// * Calls to [ArcHandle::clone_as_arc] and [ArcHandle::drop_handle] always obey their respective +/// safety requirements. In particular, this means proving (or at least affirming) that no handle +/// is ever accessed after passing it to [ArcHandle::drop_handle]. +/// +/// It is ONLY safe to implement [Sync] for a handle (or type embedding it) if all of the following +/// conditions hold: +/// +/// * The `Target` type is `Sync` +/// +/// * The handle is (correctly) `Send` +pub trait ArcHandle : Sized { + /// The target type this handle represents. + type Target : ?Sized; + + /// Converts the target Arc into a (leaked) "handle" that can cross the FFI boundary. The Arc + /// refcount does not change. The handle remains valid until passed to [Self::drop_handle]. + fn into_handle(target: Arc) -> *const Self { + Box::into_raw(Box::new(target)) as _ + } + + /// Clones an Arc from an FFI handle for kernel use, without dropping the handle. The Arc + /// refcount increases by one. + /// + /// # Safety + /// + /// Caller of this method asserts that `handle` satisfies all of the following conditions: + /// + /// * Obtained by calling [into_handle] + /// * Never cast to any other type nor dereferenced + /// * Not previously passed to [drop_handle] + unsafe fn clone_as_arc(handle: *const Self) -> Arc { + let ptr = handle as *mut Arc; + let arc = unsafe { &*ptr }; + arc.clone() + } + + /// Drops the handle, invalidating it. The Arc refcount decreases by one, which may drop the + /// underlying trait instance as well. + /// + /// # Safety + /// + /// Caller of this method asserts that `handle` satisfies all of the following conditions: + /// + /// * Obtained by calling [into_handle] + /// * Never cast to any other type nor dereferenced + /// * Not previously passed to [drop_handle] + unsafe fn drop_handle(handle: *const Self) { + let ptr = handle as *mut Arc; + let _ = unsafe { Box::from_raw(ptr) }; + } +} + +/// A special kind of [ArcHandle] which is optimized for [Sized] types. Handles for sized types are +/// more efficient if they implement this trait instead of [ArcHandle]. +pub trait SizedArcHandle : Sized { + type Target : Sized; +} + +// A blanket implementation of `ArcHandle` for all types satisfying `SizedArcHandle`. +// +// Unlike the default (unsized) implementation, which must wrap the Arc in a Box in order to obtain +// a thin pointer, the sized implementation can directly return a pointer to the Arc's +// underlying. This blanket implementation applies automatically to all types that implement +// SizedArcHandle, so a type that implements SizedArcHandle cannot directly implement ArcHandle +// (which is a Good Thing, because it preserves the important type safety invariant that every +// handle has exactly one `Target` type): +// +// ``` +// error[E0119]: conflicting implementations of trait `ArcHandle` for type `FooHandle` +// --> src/main.rs:55:1 +// | +// 28 | impl ArcHandle for H where H: SizedArcHandle { +// | -------------------------------------------------------------- first implementation here +// ... +// 55 | impl ArcHandle for FooHandle { +// | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ conflicting implementation for `FooHandle` +// ``` +impl ArcHandle for H where H: SizedArcHandle { + type Target = T; + + fn into_handle(target: Arc) -> *const Self { + Arc::into_raw(target) as _ + } + + unsafe fn clone_as_arc(handle: *const Self) -> Arc { + let ptr = handle as *const Self::Target; + unsafe { Arc::increment_strong_count(ptr) }; + unsafe { Arc::from_raw(ptr) } + } + + unsafe fn drop_handle(handle: *const Self) { + let ptr = handle as *const Self::Target; + let _ = unsafe { Arc::from_raw(ptr) }; + } +} diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 895ab22a3..2f2531c38 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -2,7 +2,9 @@ /// /// Exposes that an engine needs to call from C/C++ to interface with kernel use std::collections::HashMap; +use std::default::Default; use std::ffi::{CStr, CString}; +use std::fmt::{Display, Formatter}; use std::os::raw::{c_char, c_void}; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -12,6 +14,10 @@ use deltakernel::expressions::{BinaryOperator, Expression, Scalar}; use deltakernel::scan::ScanBuilder; use deltakernel::schema::{DataType, PrimitiveType, StructField, StructType}; use deltakernel::snapshot::Snapshot; +use deltakernel::{DeltaResult, Error, ExpressionHandler, FileSystemClient, JsonHandler, ParquetHandler, TableClient}; + +mod handle; +use handle::{ArcHandle, SizedArcHandle, Uncreate}; /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap /// the engine functions. The engine retains ownership of the iterator. @@ -50,69 +56,239 @@ impl Iterator for EngineIterator { } } -// TODO: We REALLY don't want to expose the real type of default client. Doing so forces every kernel -// API call to take either a default client or a generic client, because those are incompatible types. -use deltakernel::client::executor::tokio::TokioBackgroundExecutor; -type KernelDefaultTableClient = deltakernel::DefaultTableClient; +#[repr(C)] +pub enum KernelError { + UnknownError, // catch-all for unrecognized kernel Error types + FFIError, // errors encountered in the code layer that supports FFI + ArrowError, + GenericError, + ParquetError, + ObjectStoreError, + FileNotFoundError, + MissingColumnError, + UnexpectedColumnTypeError, + MissingDataError, + MissingVersionError, + DeletionVectorError, + InvalidUrlError, + MalformedJsonError, + MissingMetadataError, +} + +impl Display for KernelError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + KernelError::UnknownError => write!(f, "UnknownError"), + KernelError::FFIError => write!(f, "FFIError"), + KernelError::ArrowError => write!(f, "ArrowError"), + KernelError::GenericError => write!(f, "GenericError"), + KernelError::ParquetError => write!(f, "ParquetError"), + KernelError::ObjectStoreError => write!(f, "ObjectStoreError"), + KernelError::FileNotFoundError => write!(f, "FileNotFoundError"), + KernelError::MissingColumnError => write!(f, "MissingColumnError"), + KernelError::UnexpectedColumnTypeError => write!(f, "UnexpectedColumnTypeError"), + KernelError::MissingDataError => write!(f, "MissingDataError"), + KernelError::MissingVersionError => write!(f, "MissingVersionError"), + KernelError::DeletionVectorError => write!(f, "DeletionVectorError"), + KernelError::InvalidUrlError => write!(f, "InvalidUrlError"), + KernelError::MalformedJsonError => write!(f, "MalformedJsonError"), + KernelError::MissingMetadataError => write!(f, "MissingMetadataError"), + } + } +} + +impl From for KernelError { + fn from(e: Error) -> Self { + match e { + // NOTE: By definition, no kernel Error maps to FFIError + Error::Arrow(_) => KernelError::ArrowError, + Error::Generic(_) => KernelError::GenericError, + Error::GenericError { .. } => KernelError::GenericError, + Error::Parquet(_) => KernelError::ParquetError, + Error::ObjectStore(_) => KernelError::ObjectStoreError, + Error::FileNotFound(_) => KernelError::FileNotFoundError, + Error::MissingColumn(_) => KernelError::MissingColumnError, + Error::UnexpectedColumnType(_) => KernelError::UnexpectedColumnTypeError, + Error::MissingData(_) => KernelError::MissingDataError, + Error::MissingVersion => KernelError::MissingVersionError, + Error::DeletionVector(_) => KernelError::DeletionVectorError, + Error::InvalidUrl(_) => KernelError::InvalidUrlError, + Error::MalformedJson(_) => KernelError::MalformedJsonError, + Error::MissingMetadata => KernelError::MissingMetadataError, + } + } +} + +/// An error that can be returned to the engine. Engines can define additional struct fields on +/// their side, by e.g. embedding this struct as the first member of a larger struct. +#[repr(C)] +pub struct EngineError { + etype: KernelError, +} + +#[repr(C)] +pub enum ExternResult { + Ok(T), + Err(*mut EngineError), +} + +type AllocateErrorFn = extern "C" fn( + etype: KernelError, + msg_ptr: *const c_char, + msg_len: usize, +) -> *mut EngineError; + +// NOTE: We can't "just" impl From> because we require an error allocator. +impl ExternResult { + pub fn is_ok(&self) -> bool { + match self { + Self::Ok(_) => true, + Self::Err(_) => false, + } + } + pub fn is_err(&self) -> bool { + !self.is_ok() + } +} + + +trait IntoExternResult { + fn into_extern_result( + self, + allocate_error: AllocateErrorFn, + ) -> ExternResult; +} + +impl IntoExternResult for DeltaResult { + fn into_extern_result( + self, + allocate_error: AllocateErrorFn, + ) -> ExternResult { + match self { + Ok(ok) => ExternResult::Ok(ok), + Err(err) => { + let msg = format!("{}", err); + let err = allocate_error(err.into(), msg.as_ptr().cast(), msg.len()); + ExternResult::Err(err) + } + } + } +} + +// An extended version of TableClient which defines additional FFI-specific methods. +pub trait ExternTableClient { + /// The underlying table client instance to use + fn table_client(&self) -> Arc; + + /// Allocates a new error in engine memory and returns the resulting pointer. The engine is + /// expected to copy the passed-in message, which is only guaranteed to remain valid until the + /// call returns. Kernel will always immediately return the result of this method to the engine. + fn allocate_error(&self, error: Error) -> *mut EngineError; +} + +pub struct ExternTableClientHandle { + _uncreate: Uncreate, +} + +impl ArcHandle for ExternTableClientHandle { + type Target = dyn ExternTableClient; +} + +struct ExternTableClientVtable { + // Actual table client instance to use + client: Arc, + allocate_error: AllocateErrorFn, +} + +impl ExternTableClient for ExternTableClientVtable { + fn table_client(&self) -> Arc { + self.client.clone() + } + fn allocate_error(&self, error: Error) -> *mut EngineError { + let msg = format!("{}", error); + (self.allocate_error)(error.into(), msg.as_ptr().cast(), msg.len()) + } +} /// # Safety /// /// Caller is responsible to pass a valid path pointer. -unsafe fn unwrap_and_parse_path_as_url(path: *const c_char) -> Option { +unsafe fn unwrap_and_parse_path_as_url(path: *const c_char) -> DeltaResult { let path = unsafe { CStr::from_ptr(path) }; - let path = path.to_str().unwrap(); - let path = std::fs::canonicalize(PathBuf::from(path)); - let Ok(path) = path else { - println!("Couldn't open table: {}", path.err().unwrap()); - return None; - }; - let Ok(url) = Url::from_directory_path(path) else { - println!("Invalid url"); - return None; - }; - Some(url) + let path = path.to_str().map_err(Error::generic)?; + let path = std::fs::canonicalize(PathBuf::from(path)).map_err(Error::generic)?; + Url::from_directory_path(path).map_err(|_| Error::generic("Invalid url")) } #[no_mangle] pub unsafe extern "C" fn get_default_client( path: *const c_char, -) -> *const KernelDefaultTableClient { - let path = unsafe { unwrap_and_parse_path_as_url(path) }; - let Some(url) = path else { - return std::ptr::null_mut(); - }; - let table_client = KernelDefaultTableClient::try_new( + allocate_error: AllocateErrorFn, +) -> ExternResult<*const ExternTableClientHandle> { + get_default_client_impl(path, allocate_error).into_extern_result(allocate_error) +} + +unsafe fn get_default_client_impl( + path: *const c_char, + allocate_error: AllocateErrorFn, +) -> DeltaResult<*const ExternTableClientHandle> { + let url = unsafe { unwrap_and_parse_path_as_url(path) }?; + use deltakernel::client::executor::tokio::TokioBackgroundExecutor; + let table_client = deltakernel::DefaultTableClient::::try_new( &url, HashMap::::new(), Arc::new(TokioBackgroundExecutor::new()), ); - let Ok(table_client) = table_client else { - println!( - "Error creating table client: {}", - table_client.err().unwrap() - ); - return std::ptr::null_mut(); - }; - Arc::into_raw(Arc::new(table_client)) + let client = Arc::new(table_client.map_err(Error::generic)?); + let client: Arc = Arc::new(ExternTableClientVtable { + client, + allocate_error, + }); + Ok(ArcHandle::into_handle(client)) +} + +#[no_mangle] +pub unsafe extern "C" fn drop_table_client(table_client: *const ExternTableClientHandle) { + ArcHandle::drop_handle(table_client); +} + +pub struct SnapshotHandle { + _uncreate: Uncreate, +} + +impl SizedArcHandle for SnapshotHandle { + type Target = Snapshot; } /// Get the latest snapshot from the specified table #[no_mangle] pub unsafe extern "C" fn snapshot( path: *const c_char, - table_client: &KernelDefaultTableClient, -) -> *const Snapshot { - let path = unsafe { unwrap_and_parse_path_as_url(path) }; - let Some(url) = path else { - return std::ptr::null_mut(); - }; - let snapshot = Snapshot::try_new(url, table_client, None).unwrap(); - Arc::into_raw(snapshot) + table_client: *const ExternTableClientHandle, + allocate_error: AllocateErrorFn, +) -> ExternResult<*const SnapshotHandle> { + snapshot_impl(path, table_client).into_extern_result(allocate_error) +} + +unsafe fn snapshot_impl( + path: *const c_char, + table_client: *const ExternTableClientHandle, +) -> DeltaResult<*const SnapshotHandle> { + let url = unsafe { unwrap_and_parse_path_as_url(path) }?; + let table_client = unsafe { ArcHandle::clone_as_arc(table_client) }; + let snapshot = Snapshot::try_new(url, table_client.table_client().as_ref(), None)?; + Ok(ArcHandle::into_handle(snapshot)) +} + +#[no_mangle] +pub unsafe extern "C" fn drop_snapshot(snapshot: *const SnapshotHandle) { + ArcHandle::drop_handle(snapshot); } /// Get the version of the specified snapshot #[no_mangle] -pub extern "C" fn version(snapshot: &Snapshot) -> u64 { +pub unsafe extern "C" fn version(snapshot: *const SnapshotHandle) -> u64 { + let snapshot = unsafe { ArcHandle::clone_as_arc(snapshot) }; snapshot.version() } @@ -139,7 +315,8 @@ pub struct EngineSchemaVisitor { } #[no_mangle] -pub extern "C" fn visit_schema(snapshot: &Snapshot, visitor: &mut EngineSchemaVisitor) -> usize { +pub extern "C" fn visit_schema(snapshot: *const SnapshotHandle, visitor: &mut EngineSchemaVisitor) -> usize { + let snapshot = unsafe { ArcHandle::clone_as_arc(snapshot) }; // Visit all the fields of a struct and return the list of children fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, s.fields.len()); @@ -380,7 +557,7 @@ pub struct KernelScanFileIterator { _snapshot: Arc, // The iterator also has an internal borrowed reference to the table client it came from. - _table_client: Arc, + _table_client: Arc, } impl Drop for KernelScanFileIterator { @@ -389,20 +566,18 @@ impl Drop for KernelScanFileIterator { } } -/// Get a FileList for all the files that need to be read from the table. NB: This _consumes_ the -/// snapshot, it is no longer valid after making this call (TODO: We should probably fix this?) -/// +/// Get a FileList for all the files that need to be read from the table. /// # Safety /// /// Caller is responsible to pass a valid snapshot pointer. #[no_mangle] pub extern "C" fn kernel_scan_files_init( - snapshot: *const Snapshot, - table_client: *const KernelDefaultTableClient, + snapshot: *const SnapshotHandle, + table_client: *const ExternTableClientHandle, predicate: Option<&mut EnginePredicate>, ) -> *mut KernelScanFileIterator { - let snapshot: Arc = unsafe { Arc::from_raw(snapshot) }; - let table_client = unsafe { Arc::from_raw(table_client) }; + let snapshot = unsafe { ArcHandle::clone_as_arc(snapshot) }; + let table_client = unsafe { ArcHandle::clone_as_arc(table_client) }; let mut scan_builder = ScanBuilder::new(snapshot.clone()); if let Some(predicate) = predicate { // TODO: There is a lot of redundancy between the various visit_expression_XXX methods here, @@ -418,12 +593,13 @@ pub extern "C" fn kernel_scan_files_init( scan_builder = scan_builder.with_predicate(predicate); } } - let scan_adds = scan_builder.build().files(table_client.as_ref()).unwrap(); + let real_table_client = table_client.table_client(); + let scan_adds = scan_builder.build().files(real_table_client.as_ref()).unwrap(); let files = scan_adds.map(|add| add.unwrap().path); let files = KernelScanFileIterator { files: Box::new(Mutex::new(files)), _snapshot: snapshot, - _table_client: table_client, + _table_client: real_table_client, }; Box::into_raw(Box::new(files)) } diff --git a/kernel/src/client/mod.rs b/kernel/src/client/mod.rs index 9e1941573..47077e8b5 100644 --- a/kernel/src/client/mod.rs +++ b/kernel/src/client/mod.rs @@ -85,9 +85,7 @@ impl DefaultTableClient { expression: Arc::new(DefaultExpressionHandler {}), } } -} -impl DefaultTableClient { pub fn get_object_store_for_url(&self, _url: &Url) -> Option> { Some(self.store.clone()) } From 92abf11a11b5f12991e0c17f1152c2ba4062712f Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 23 Feb 2024 16:23:28 -0800 Subject: [PATCH 08/27] clippy and fmt --- ffi/src/handle.rs | 15 ++++++---- ffi/src/lib.rs | 72 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/ffi/src/handle.rs b/ffi/src/handle.rs index 9573d5ed7..c5811a740 100644 --- a/ffi/src/handle.rs +++ b/ffi/src/handle.rs @@ -4,7 +4,7 @@ mod uncreate { /// A struct that cannot be instantiated by any rust code because this modeule exposes no public /// constructor for it. pub struct Uncreate { - _private: usize + _private: usize, } } @@ -115,9 +115,9 @@ pub type Uncreate = uncreate::Uncreate; /// * The `Target` type is `Sync` /// /// * The handle is (correctly) `Send` -pub trait ArcHandle : Sized { +pub trait ArcHandle: Sized { /// The target type this handle represents. - type Target : ?Sized; + type Target: ?Sized; /// Converts the target Arc into a (leaked) "handle" that can cross the FFI boundary. The Arc /// refcount does not change. The handle remains valid until passed to [Self::drop_handle]. @@ -159,8 +159,8 @@ pub trait ArcHandle : Sized { /// A special kind of [ArcHandle] which is optimized for [Sized] types. Handles for sized types are /// more efficient if they implement this trait instead of [ArcHandle]. -pub trait SizedArcHandle : Sized { - type Target : Sized; +pub trait SizedArcHandle: Sized { + type Target: Sized; } // A blanket implementation of `ArcHandle` for all types satisfying `SizedArcHandle`. @@ -182,7 +182,10 @@ pub trait SizedArcHandle : Sized { // 55 | impl ArcHandle for FooHandle { // | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ conflicting implementation for `FooHandle` // ``` -impl ArcHandle for H where H: SizedArcHandle { +impl ArcHandle for H +where + H: SizedArcHandle, +{ type Target = T; fn into_handle(target: Arc) -> *const Self { diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 2f2531c38..946929655 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -14,7 +14,7 @@ use deltakernel::expressions::{BinaryOperator, Expression, Scalar}; use deltakernel::scan::ScanBuilder; use deltakernel::schema::{DataType, PrimitiveType, StructField, StructType}; use deltakernel::snapshot::Snapshot; -use deltakernel::{DeltaResult, Error, ExpressionHandler, FileSystemClient, JsonHandler, ParquetHandler, TableClient}; +use deltakernel::{DeltaResult, Error, TableClient}; mod handle; use handle::{ArcHandle, SizedArcHandle, Uncreate}; @@ -59,7 +59,7 @@ impl Iterator for EngineIterator { #[repr(C)] pub enum KernelError { UnknownError, // catch-all for unrecognized kernel Error types - FFIError, // errors encountered in the code layer that supports FFI + FFIError, // errors encountered in the code layer that supports FFI ArrowError, GenericError, ParquetError, @@ -132,11 +132,8 @@ pub enum ExternResult { Err(*mut EngineError), } -type AllocateErrorFn = extern "C" fn( - etype: KernelError, - msg_ptr: *const c_char, - msg_len: usize, -) -> *mut EngineError; +type AllocateErrorFn = + extern "C" fn(etype: KernelError, msg_ptr: *const c_char, msg_len: usize) -> *mut EngineError; // NOTE: We can't "just" impl From> because we require an error allocator. impl ExternResult { @@ -151,19 +148,12 @@ impl ExternResult { } } - trait IntoExternResult { - fn into_extern_result( - self, - allocate_error: AllocateErrorFn, - ) -> ExternResult; + fn into_extern_result(self, allocate_error: AllocateErrorFn) -> ExternResult; } impl IntoExternResult for DeltaResult { - fn into_extern_result( - self, - allocate_error: AllocateErrorFn, - ) -> ExternResult { + fn into_extern_result(self, allocate_error: AllocateErrorFn) -> ExternResult { match self { Ok(ok) => ExternResult::Ok(ok), Err(err) => { @@ -200,6 +190,18 @@ struct ExternTableClientVtable { allocate_error: AllocateErrorFn, } +/// # Safety +/// +/// Kernel doesn't use any threading or concurrency. If engine chooses to do so, engine is +/// responsible to handle any races that could result. +unsafe impl Send for ExternTableClientVtable {} + +/// # Safety +/// +/// Kernel doesn't use any threading or concurrency. If engine chooses to do so, engine is +/// responsible to handle any races that could result. +unsafe impl Sync for ExternTableClientVtable {} + impl ExternTableClient for ExternTableClientVtable { fn table_client(&self) -> Arc { self.client.clone() @@ -220,6 +222,9 @@ unsafe fn unwrap_and_parse_path_as_url(path: *const c_char) -> DeltaResult Url::from_directory_path(path).map_err(|_| Error::generic("Invalid url")) } +/// # Safety +/// +/// Caller is responsible to pass a valid path pointer. #[no_mangle] pub unsafe extern "C" fn get_default_client( path: *const c_char, @@ -247,6 +252,9 @@ unsafe fn get_default_client_impl( Ok(ArcHandle::into_handle(client)) } +/// # Safety +/// +/// Caller is responsible to pass a valid handle. #[no_mangle] pub unsafe extern "C" fn drop_table_client(table_client: *const ExternTableClientHandle) { ArcHandle::drop_handle(table_client); @@ -261,6 +269,10 @@ impl SizedArcHandle for SnapshotHandle { } /// Get the latest snapshot from the specified table +/// +/// # Safety +/// +/// Caller is responsible to pass valid handles and path pointer. #[no_mangle] pub unsafe extern "C" fn snapshot( path: *const c_char, @@ -280,12 +292,19 @@ unsafe fn snapshot_impl( Ok(ArcHandle::into_handle(snapshot)) } +/// # Safety +/// +/// Caller is responsible to pass a valid handle. #[no_mangle] pub unsafe extern "C" fn drop_snapshot(snapshot: *const SnapshotHandle) { ArcHandle::drop_handle(snapshot); } /// Get the version of the specified snapshot +/// +/// # Safety +/// +/// Caller is responsible to pass a valid handle. #[no_mangle] pub unsafe extern "C" fn version(snapshot: *const SnapshotHandle) -> u64 { let snapshot = unsafe { ArcHandle::clone_as_arc(snapshot) }; @@ -314,8 +333,14 @@ pub struct EngineSchemaVisitor { visit_long: extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: *const c_char) -> (), } +/// # Safety +/// +/// Caller is responsible to pass a valid handle. #[no_mangle] -pub extern "C" fn visit_schema(snapshot: *const SnapshotHandle, visitor: &mut EngineSchemaVisitor) -> usize { +pub unsafe extern "C" fn visit_schema( + snapshot: *const SnapshotHandle, + visitor: &mut EngineSchemaVisitor, +) -> usize { let snapshot = unsafe { ArcHandle::clone_as_arc(snapshot) }; // Visit all the fields of a struct and return the list of children fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { @@ -571,7 +596,7 @@ impl Drop for KernelScanFileIterator { /// /// Caller is responsible to pass a valid snapshot pointer. #[no_mangle] -pub extern "C" fn kernel_scan_files_init( +pub unsafe extern "C" fn kernel_scan_files_init( snapshot: *const SnapshotHandle, table_client: *const ExternTableClientHandle, predicate: Option<&mut EnginePredicate>, @@ -594,7 +619,10 @@ pub extern "C" fn kernel_scan_files_init( } } let real_table_client = table_client.table_client(); - let scan_adds = scan_builder.build().files(real_table_client.as_ref()).unwrap(); + let scan_adds = scan_builder + .build() + .files(real_table_client.as_ref()) + .unwrap(); let files = scan_adds.map(|add| add.unwrap().path); let files = KernelScanFileIterator { files: Box::new(Mutex::new(files)), @@ -617,7 +645,11 @@ pub extern "C" fn kernel_scan_files_next( } } +/// # Safety +/// +/// Caller is responsible to (at most once) pass a valid pointer returned by a call to +/// [kernel_scan_files_init]. #[no_mangle] -pub extern "C" fn kernel_scan_files_free(files: *mut KernelScanFileIterator) { +pub unsafe extern "C" fn kernel_scan_files_free(files: *mut KernelScanFileIterator) { let _ = unsafe { Box::from_raw(files) }; } From 7a2179b0a55fb19452846de45bd529afe36623d7 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 26 Feb 2024 06:58:26 -0800 Subject: [PATCH 09/27] more string and error handling --- ffi/kernel.h | 114 ++++++++++++++++---- ffi/kernel.hpp | 70 ++++++++---- ffi/src/handle.rs | 18 ++++ ffi/src/lib.rs | 269 +++++++++++++++++++++++++++++++++------------- 4 files changed, 353 insertions(+), 118 deletions(-) diff --git a/ffi/kernel.h b/ffi/kernel.h index 52bf35484..d9e2fdb38 100644 --- a/ffi/kernel.h +++ b/ffi/kernel.h @@ -68,9 +68,32 @@ typedef struct ExternResult______ExternTableClientHandle { }; } ExternResult______ExternTableClientHandle; -typedef struct EngineError *(*AllocateErrorFn)(enum KernelError etype, - const char *msg_ptr, - uintptr_t msg_len); +/** + * A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The + * slice is only valid until the function it was passed into returns, and should not be copied. + * + * # Safety + * + * Intentionally not Copy, Clone, Send, nor Sync. + * + * Whoever instantiates the struct must ensure it does not outlive the data it points to. The + * compiler cannot help us here, because raw pointers don't have lifetimes. To reduce the risk of + * accidental misuse, it is recommended to only instantiate this struct as a function arg, by + * converting a `&str` value `Into`, so the borrowed reference protects the + * function call (callee must not retain any references to the slice after the call returns): + * + * ``` + * fn wants_slice(slice: KernelStringSlice) { ... } + * let msg = String::from(...); + * wants_slice(msg.as_ref().into()); + * ``` + */ +typedef struct KernelStringSlice { + const char *ptr; + uintptr_t len; +} KernelStringSlice; + +typedef struct EngineError *(*AllocateErrorFn)(enum KernelError etype, struct KernelStringSlice msg); typedef enum ExternResult______SnapshotHandle_Tag { Ok______SnapshotHandle, @@ -94,18 +117,52 @@ typedef struct EngineSchemaVisitor { uintptr_t (*make_field_list)(void *data, uintptr_t reserve); void (*visit_struct)(void *data, uintptr_t sibling_list_id, - const char *name, + struct KernelStringSlice name, uintptr_t child_list_id); - void (*visit_string)(void *data, uintptr_t sibling_list_id, const char *name); - void (*visit_integer)(void *data, uintptr_t sibling_list_id, const char *name); - void (*visit_long)(void *data, uintptr_t sibling_list_id, const char *name); + void (*visit_string)(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name); + void (*visit_integer)(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name); + void (*visit_long)(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name); } EngineSchemaVisitor; +typedef enum ExternResult_____KernelScanFileIterator_Tag { + Ok_____KernelScanFileIterator, + Err_____KernelScanFileIterator, +} ExternResult_____KernelScanFileIterator_Tag; + +typedef struct ExternResult_____KernelScanFileIterator { + ExternResult_____KernelScanFileIterator_Tag tag; + union { + struct { + struct KernelScanFileIterator *ok; + }; + struct { + struct EngineError *err; + }; + }; +} ExternResult_____KernelScanFileIterator; + typedef struct EnginePredicate { void *predicate; uintptr_t (*visitor)(void *predicate, struct KernelExpressionVisitorState *state); } EnginePredicate; +typedef enum ExternResult_bool_Tag { + Ok_bool, + Err_bool, +} ExternResult_bool_Tag; + +typedef struct ExternResult_bool { + ExternResult_bool_Tag tag; + union { + struct { + bool ok; + }; + struct { + struct EngineError *err; + }; + }; +} ExternResult_bool; + /** * test function to print for items. this assumes each item is an `int` */ @@ -116,7 +173,7 @@ void iterate(struct EngineIterator *it); * * Caller is responsible to pass a valid path pointer. */ -struct ExternResult______ExternTableClientHandle get_default_client(const char *path, +struct ExternResult______ExternTableClientHandle get_default_client(struct KernelStringSlice path, AllocateErrorFn allocate_error); /** @@ -133,9 +190,8 @@ void drop_table_client(const struct ExternTableClientHandle *table_client); * * Caller is responsible to pass valid handles and path pointer. */ -struct ExternResult______SnapshotHandle snapshot(const char *path, - const struct ExternTableClientHandle *table_client, - AllocateErrorFn allocate_error); +struct ExternResult______SnapshotHandle snapshot(struct KernelStringSlice path, + const struct ExternTableClientHandle *table_client); /** * # Safety @@ -173,10 +229,19 @@ uintptr_t visit_expression_ge(struct KernelExpressionVisitorState *state, uintpt uintptr_t visit_expression_eq(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); -uintptr_t visit_expression_column(struct KernelExpressionVisitorState *state, const char *name); +/** + * # Safety + * The string slice must be valid + */ +uintptr_t visit_expression_column(struct KernelExpressionVisitorState *state, + struct KernelStringSlice name); +/** + * # Safety + * The string slice must be valid + */ uintptr_t visit_expression_literal_string(struct KernelExpressionVisitorState *state, - const char *value); + struct KernelStringSlice value); uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *state, int64_t value); @@ -186,15 +251,20 @@ uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *sta * * Caller is responsible to pass a valid snapshot pointer. */ -struct KernelScanFileIterator *kernel_scan_files_init(const struct SnapshotHandle *snapshot, - const struct ExternTableClientHandle *table_client, - struct EnginePredicate *predicate); - -void kernel_scan_files_next(struct KernelScanFileIterator *files, - void *engine_context, - void (*engine_visitor)(void *engine_context, - const char *ptr, - uintptr_t len)); +struct ExternResult_____KernelScanFileIterator kernel_scan_files_init(const struct SnapshotHandle *snapshot, + const struct ExternTableClientHandle *table_client, + struct EnginePredicate *predicate); + +/** + * # Safety + * + * The iterator must be valid (returned by [kernel_scan_files_init]) and not yet freed by + * [kernel_scan_files_free]. The visitor function pointer must be non-null. + */ +struct ExternResult_bool kernel_scan_files_next(struct KernelScanFileIterator *files, + void *engine_context, + void (*engine_visitor)(void *engine_context, + struct KernelStringSlice file_name)); /** * # Safety diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp index d86043d7a..55f0c384c 100644 --- a/ffi/kernel.hpp +++ b/ffi/kernel.hpp @@ -68,18 +68,41 @@ struct ExternResult { }; }; -using AllocateErrorFn = EngineError*(*)(KernelError etype, const char *msg_ptr, uintptr_t msg_len); +/// A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The +/// slice is only valid until the function it was passed into returns, and should not be copied. +/// +/// # Safety +/// +/// Intentionally not Copy, Clone, Send, nor Sync. +/// +/// Whoever instantiates the struct must ensure it does not outlive the data it points to. The +/// compiler cannot help us here, because raw pointers don't have lifetimes. To reduce the risk of +/// accidental misuse, it is recommended to only instantiate this struct as a function arg, by +/// converting a `&str` value `Into`, so the borrowed reference protects the +/// function call (callee must not retain any references to the slice after the call returns): +/// +/// ``` +/// fn wants_slice(slice: KernelStringSlice) { ... } +/// let msg = String::from(...); +/// wants_slice(msg.as_ref().into()); +/// ``` +struct KernelStringSlice { + const char *ptr; + uintptr_t len; +}; + +using AllocateErrorFn = EngineError*(*)(KernelError etype, KernelStringSlice msg); struct EngineSchemaVisitor { void *data; uintptr_t (*make_field_list)(void *data, uintptr_t reserve); void (*visit_struct)(void *data, uintptr_t sibling_list_id, - const char *name, + KernelStringSlice name, uintptr_t child_list_id); - void (*visit_string)(void *data, uintptr_t sibling_list_id, const char *name); - void (*visit_integer)(void *data, uintptr_t sibling_list_id, const char *name); - void (*visit_long)(void *data, uintptr_t sibling_list_id, const char *name); + void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); }; struct EnginePredicate { @@ -95,7 +118,7 @@ void iterate(EngineIterator *it); /// # Safety /// /// Caller is responsible to pass a valid path pointer. -ExternResult get_default_client(const char *path, +ExternResult get_default_client(KernelStringSlice path, AllocateErrorFn allocate_error); /// # Safety @@ -108,9 +131,8 @@ void drop_table_client(const ExternTableClientHandle *table_client); /// # Safety /// /// Caller is responsible to pass valid handles and path pointer. -ExternResult snapshot(const char *path, - const ExternTableClientHandle *table_client, - AllocateErrorFn allocate_error); +ExternResult snapshot(KernelStringSlice path, + const ExternTableClientHandle *table_client); /// # Safety /// @@ -141,9 +163,14 @@ uintptr_t visit_expression_ge(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t visit_expression_eq(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); -uintptr_t visit_expression_column(KernelExpressionVisitorState *state, const char *name); +/// # Safety +/// The string slice must be valid +uintptr_t visit_expression_column(KernelExpressionVisitorState *state, KernelStringSlice name); -uintptr_t visit_expression_literal_string(KernelExpressionVisitorState *state, const char *value); +/// # Safety +/// The string slice must be valid +uintptr_t visit_expression_literal_string(KernelExpressionVisitorState *state, + KernelStringSlice value); uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int64_t value); @@ -151,15 +178,18 @@ uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int /// # Safety /// /// Caller is responsible to pass a valid snapshot pointer. -KernelScanFileIterator *kernel_scan_files_init(const SnapshotHandle *snapshot, - const ExternTableClientHandle *table_client, - EnginePredicate *predicate); - -void kernel_scan_files_next(KernelScanFileIterator *files, - void *engine_context, - void (*engine_visitor)(void *engine_context, - const char *ptr, - uintptr_t len)); +ExternResult kernel_scan_files_init(const SnapshotHandle *snapshot, + const ExternTableClientHandle *table_client, + EnginePredicate *predicate); + +/// # Safety +/// +/// The iterator must be valid (returned by [kernel_scan_files_init]) and not yet freed by +/// [kernel_scan_files_free]. The visitor function pointer must be non-null. +ExternResult kernel_scan_files_next(KernelScanFileIterator *files, + void *engine_context, + void (*engine_visitor)(void *engine_context, + KernelStringSlice file_name)); /// # Safety /// diff --git a/ffi/src/handle.rs b/ffi/src/handle.rs index c5811a740..9e9e41ba5 100644 --- a/ffi/src/handle.rs +++ b/ffi/src/handle.rs @@ -1,5 +1,22 @@ use std::sync::Arc; +/// Helper trait that simplifies passing an instance of a Sized type across the FFI boundary as a +/// leaked thin pointer. Does not include any reference-counting capability, so engine is +/// responsible to do whatever reference counting the engine may need. Engine is also responsible +/// not to drop an in-use handle, so kernel code can safely dereference the pointer. +pub trait BoxHandle: Sized { + fn into_handle(self) -> *mut Self { + Box::into_raw(Box::new(self)) + } + /// # Safety + /// + /// The `handle` was returned by a call to `into_handle`, has not already been passed to + /// `drop_handle`, and has no other live references. + unsafe fn drop_handle(handle: *mut Self) { + let _ = unsafe { Box::from_raw(handle) }; + } +} + mod uncreate { /// A struct that cannot be instantiated by any rust code because this modeule exposes no public /// constructor for it. @@ -151,6 +168,7 @@ pub trait ArcHandle: Sized { /// * Obtained by calling [into_handle] /// * Never cast to any other type nor dereferenced /// * Not previously passed to [drop_handle] + /// * Has no other live references unsafe fn drop_handle(handle: *const Self) { let ptr = handle as *mut Arc; let _ = unsafe { Box::from_raw(ptr) }; diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 946929655..e14ba9fe4 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -3,21 +3,21 @@ /// Exposes that an engine needs to call from C/C++ to interface with kernel use std::collections::HashMap; use std::default::Default; -use std::ffi::{CStr, CString}; use std::fmt::{Display, Formatter}; use std::os::raw::{c_char, c_void}; use std::path::PathBuf; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use url::Url; use deltakernel::expressions::{BinaryOperator, Expression, Scalar}; use deltakernel::scan::ScanBuilder; use deltakernel::schema::{DataType, PrimitiveType, StructField, StructType}; use deltakernel::snapshot::Snapshot; +use deltakernel::Add; use deltakernel::{DeltaResult, Error, TableClient}; mod handle; -use handle::{ArcHandle, SizedArcHandle, Uncreate}; +use handle::{ArcHandle, BoxHandle, SizedArcHandle, Uncreate}; /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap /// the engine functions. The engine retains ownership of the iterator. @@ -56,6 +56,59 @@ impl Iterator for EngineIterator { } } +/// A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The +/// slice is only valid until the function it was passed into returns, and should not be copied. +/// +/// # Safety +/// +/// Intentionally not Copy, Clone, Send, nor Sync. +/// +/// Whoever instantiates the struct must ensure it does not outlive the data it points to. The +/// compiler cannot help us here, because raw pointers don't have lifetimes. To reduce the risk of +/// accidental misuse, it is recommended to only instantiate this struct as a function arg, by +/// converting a `&str` value `Into`, so the borrowed reference protects the +/// function call (callee must not retain any references to the slice after the call returns): +/// +/// ``` +/// fn wants_slice(slice: KernelStringSlice) { ... } +/// let msg = String::from(...); +/// wants_slice(msg.as_ref().into()); +/// ``` +#[repr(C)] +pub struct KernelStringSlice { + ptr: *const c_char, + len: usize, +} + +// Intentionally not From, in order to reduce risk of accidental misuse. The main use is for callers +// to pass e.g. `my_str.as_str().into()` as a function arg. +#[allow(clippy::from_over_into)] +impl Into for &str { + fn into(self) -> KernelStringSlice { + KernelStringSlice { + ptr: self.as_ptr().cast(), + len: self.len(), + } + } +} + +trait FromStringSlice { + unsafe fn from_slice(slice: KernelStringSlice) -> Self; +} + +impl FromStringSlice for String { + /// Converts a slice back to a string + /// + /// # Safety + /// + /// The slice must be a valid (non-null) pointer, and must point to the indicated number of + /// valid utf8 bytes. + unsafe fn from_slice(slice: KernelStringSlice) -> String { + let slice = std::slice::from_raw_parts(slice.ptr.cast(), slice.len); + std::str::from_utf8_unchecked(slice).to_string() + } +} + #[repr(C)] pub enum KernelError { UnknownError, // catch-all for unrecognized kernel Error types @@ -133,7 +186,7 @@ pub enum ExternResult { } type AllocateErrorFn = - extern "C" fn(etype: KernelError, msg_ptr: *const c_char, msg_len: usize) -> *mut EngineError; + extern "C" fn(etype: KernelError, msg: KernelStringSlice) -> *mut EngineError; // NOTE: We can't "just" impl From> because we require an error allocator. impl ExternResult { @@ -148,32 +201,83 @@ impl ExternResult { } } +/// Represents an engine error allocator. Ultimately all implementations will fall back to an +/// [AllocateErrorFn] provided by the engine, but the trait allows us to conveniently access the +/// allocator in various types that may wrap it. +pub trait AllocateError { + /// Allocates a new error in engine memory and returns the resulting pointer. The engine is + /// expected to copy the passed-in message, which is only guaranteed to remain valid until the + /// call returns. Kernel will always immediately return the result of this method to the engine. + /// + /// # Safety + /// + /// The string slice must be valid until the call returns, and the error allocator must also be + /// valid. + unsafe fn allocate_error(&self, etype: KernelError, msg: KernelStringSlice) + -> *mut EngineError; +} + +// TODO: Why is this even needed... +impl AllocateError for &dyn AllocateError { + unsafe fn allocate_error( + &self, + etype: KernelError, + msg: KernelStringSlice, + ) -> *mut EngineError { + (*self).allocate_error(etype, msg) + } +} + +impl AllocateError for AllocateErrorFn { + unsafe fn allocate_error( + &self, + etype: KernelError, + msg: KernelStringSlice, + ) -> *mut EngineError { + self(etype, msg) + } +} +impl AllocateError for *const ExternTableClientHandle { + /// # Safety + /// + /// In addition to the usual requirements, the table client handle must be valid. + unsafe fn allocate_error( + &self, + etype: KernelError, + msg: KernelStringSlice, + ) -> *mut EngineError { + ArcHandle::clone_as_arc(*self) + .error_allocator() + .allocate_error(etype, msg) + } +} + +/// Converts a [DeltaResult] into an [ExternResult], using the engine's error allocator. +/// +/// # Safety +/// +/// The allocator must be valid. trait IntoExternResult { - fn into_extern_result(self, allocate_error: AllocateErrorFn) -> ExternResult; + unsafe fn into_extern_result(self, allocate_error: impl AllocateError) -> ExternResult; } impl IntoExternResult for DeltaResult { - fn into_extern_result(self, allocate_error: AllocateErrorFn) -> ExternResult { + unsafe fn into_extern_result(self, allocate_error: impl AllocateError) -> ExternResult { match self { Ok(ok) => ExternResult::Ok(ok), Err(err) => { let msg = format!("{}", err); - let err = allocate_error(err.into(), msg.as_ptr().cast(), msg.len()); + let err = unsafe { allocate_error.allocate_error(err.into(), msg.as_str().into()) }; ExternResult::Err(err) } } } } -// An extended version of TableClient which defines additional FFI-specific methods. +// A wrapper for TableClient which defines additional FFI-specific methods. pub trait ExternTableClient { - /// The underlying table client instance to use fn table_client(&self) -> Arc; - - /// Allocates a new error in engine memory and returns the resulting pointer. The engine is - /// expected to copy the passed-in message, which is only guaranteed to remain valid until the - /// call returns. Kernel will always immediately return the result of this method to the engine. - fn allocate_error(&self, error: Error) -> *mut EngineError; + fn error_allocator(&self) -> &dyn AllocateError; } pub struct ExternTableClientHandle { @@ -206,18 +310,16 @@ impl ExternTableClient for ExternTableClientVtable { fn table_client(&self) -> Arc { self.client.clone() } - fn allocate_error(&self, error: Error) -> *mut EngineError { - let msg = format!("{}", error); - (self.allocate_error)(error.into(), msg.as_ptr().cast(), msg.len()) + fn error_allocator(&self) -> &dyn AllocateError { + &self.allocate_error } } /// # Safety /// /// Caller is responsible to pass a valid path pointer. -unsafe fn unwrap_and_parse_path_as_url(path: *const c_char) -> DeltaResult { - let path = unsafe { CStr::from_ptr(path) }; - let path = path.to_str().map_err(Error::generic)?; +unsafe fn unwrap_and_parse_path_as_url(path: KernelStringSlice) -> DeltaResult { + let path = unsafe { String::from_slice(path) }; let path = std::fs::canonicalize(PathBuf::from(path)).map_err(Error::generic)?; Url::from_directory_path(path).map_err(|_| Error::generic("Invalid url")) } @@ -227,14 +329,14 @@ unsafe fn unwrap_and_parse_path_as_url(path: *const c_char) -> DeltaResult /// Caller is responsible to pass a valid path pointer. #[no_mangle] pub unsafe extern "C" fn get_default_client( - path: *const c_char, + path: KernelStringSlice, allocate_error: AllocateErrorFn, ) -> ExternResult<*const ExternTableClientHandle> { get_default_client_impl(path, allocate_error).into_extern_result(allocate_error) } unsafe fn get_default_client_impl( - path: *const c_char, + path: KernelStringSlice, allocate_error: AllocateErrorFn, ) -> DeltaResult<*const ExternTableClientHandle> { let url = unsafe { unwrap_and_parse_path_as_url(path) }?; @@ -275,20 +377,19 @@ impl SizedArcHandle for SnapshotHandle { /// Caller is responsible to pass valid handles and path pointer. #[no_mangle] pub unsafe extern "C" fn snapshot( - path: *const c_char, + path: KernelStringSlice, table_client: *const ExternTableClientHandle, - allocate_error: AllocateErrorFn, ) -> ExternResult<*const SnapshotHandle> { - snapshot_impl(path, table_client).into_extern_result(allocate_error) + snapshot_impl(path, table_client).into_extern_result(table_client) } unsafe fn snapshot_impl( - path: *const c_char, - table_client: *const ExternTableClientHandle, + path: KernelStringSlice, + extern_table_client: *const ExternTableClientHandle, ) -> DeltaResult<*const SnapshotHandle> { let url = unsafe { unwrap_and_parse_path_as_url(path) }?; - let table_client = unsafe { ArcHandle::clone_as_arc(table_client) }; - let snapshot = Snapshot::try_new(url, table_client.table_client().as_ref(), None)?; + let extern_table_client = unsafe { ArcHandle::clone_as_arc(extern_table_client) }; + let snapshot = Snapshot::try_new(url, extern_table_client.table_client().as_ref(), None)?; Ok(ArcHandle::into_handle(snapshot)) } @@ -311,7 +412,7 @@ pub unsafe extern "C" fn version(snapshot: *const SnapshotHandle) -> u64 { snapshot.version() } -// WARNING: the visitor MUST NOT retain internal references to the c_char names passed to visitor methods +// WARNING: the visitor MUST NOT retain internal references to the string slices passed to visitor methods // TODO: other types, nullability #[repr(C)] pub struct EngineSchemaVisitor { @@ -323,14 +424,15 @@ pub struct EngineSchemaVisitor { visit_struct: extern "C" fn( data: *mut c_void, sibling_list_id: usize, - name: *const c_char, + name: KernelStringSlice, child_list_id: usize, ) -> (), visit_string: - extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: *const c_char) -> (), + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice) -> (), visit_integer: - extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: *const c_char) -> (), - visit_long: extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: *const c_char) -> (), + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice) -> (), + visit_long: + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice) -> (), } /// # Safety @@ -353,20 +455,20 @@ pub unsafe extern "C" fn visit_schema( // Visit a struct field (recursively) and add the result to the list of siblings. fn visit_field(visitor: &EngineSchemaVisitor, sibling_list_id: usize, field: &StructField) { - let name = CString::new(field.name.as_bytes()).unwrap(); + let name: &str = field.name.as_ref(); match &field.data_type { DataType::Primitive(PrimitiveType::Integer) => { - (visitor.visit_integer)(visitor.data, sibling_list_id, name.as_ptr()) + (visitor.visit_integer)(visitor.data, sibling_list_id, name.into()) } DataType::Primitive(PrimitiveType::Long) => { - (visitor.visit_long)(visitor.data, sibling_list_id, name.as_ptr()) + (visitor.visit_long)(visitor.data, sibling_list_id, name.into()) } DataType::Primitive(PrimitiveType::String) => { - (visitor.visit_string)(visitor.data, sibling_list_id, name.as_ptr()) + (visitor.visit_string)(visitor.data, sibling_list_id, name.into()) } DataType::Struct(s) => { let child_list_id = visit_struct_fields(visitor, s); - (visitor.visit_struct)(visitor.data, sibling_list_id, name.as_ptr(), child_list_id); + (visitor.visit_struct)(visitor.data, sibling_list_id, name.into(), child_list_id); } other => println!("Unsupported data type: {}", other), } @@ -458,11 +560,6 @@ fn wrap_expression(state: &mut KernelExpressionVisitorState, expr: Expression) - state.inflight_expressions.insert(expr) } -fn unwrap_c_string(s: *const c_char) -> String { - let s = unsafe { CStr::from_ptr(s) }; - s.to_str().unwrap().to_string() -} - fn unwrap_kernel_expression( state: &mut KernelExpressionVisitorState, exprid: usize, @@ -543,22 +640,26 @@ pub extern "C" fn visit_expression_eq( visit_expression_binary(state, BinaryOperator::Equal, a, b) } +/// # Safety +/// The string slice must be valid #[no_mangle] -pub extern "C" fn visit_expression_column( +pub unsafe extern "C" fn visit_expression_column( state: &mut KernelExpressionVisitorState, - name: *const c_char, + name: KernelStringSlice, ) -> usize { - wrap_expression(state, Expression::Column(unwrap_c_string(name))) + wrap_expression(state, Expression::Column(String::from_slice(name))) } +/// # Safety +/// The string slice must be valid #[no_mangle] -pub extern "C" fn visit_expression_literal_string( +pub unsafe extern "C" fn visit_expression_literal_string( state: &mut KernelExpressionVisitorState, - value: *const c_char, + value: KernelStringSlice, ) -> usize { wrap_expression( state, - Expression::Literal(Scalar::from(unwrap_c_string(value))), + Expression::Literal(Scalar::from(String::from_slice(value))), ) } @@ -570,21 +671,19 @@ pub extern "C" fn visit_expression_literal_long( wrap_expression(state, Expression::Literal(Scalar::from(value))) } +// Intentionally opaque to the engine. pub struct KernelScanFileIterator { // Box -> Wrap its unsized content this struct is fixed-size with thin pointers. - // Mutex -> We need to protect the iterator against multi-threaded engine access. // Item = String -> Owned items because rust can't correctly express lifetimes for borrowed items // (we would need a way to assert that item lifetimes are bounded by the iterator's lifetime). - files: Box>>, - - // The iterator has an internal borrowed reference to the Snapshot it came from. Rust can't - // track that across the FFI boundary, so it's up to us to keep the Snapshot alive. - _snapshot: Arc, + files: Box>>, - // The iterator also has an internal borrowed reference to the table client it came from. - _table_client: Arc, + // Also keep a reference to the external client for its error allocator. + table_client: Arc, } +impl BoxHandle for KernelScanFileIterator {} + impl Drop for KernelScanFileIterator { fn drop(&mut self) { println!("dropping KernelScanFileIterator"); @@ -600,9 +699,17 @@ pub unsafe extern "C" fn kernel_scan_files_init( snapshot: *const SnapshotHandle, table_client: *const ExternTableClientHandle, predicate: Option<&mut EnginePredicate>, -) -> *mut KernelScanFileIterator { +) -> ExternResult<*mut KernelScanFileIterator> { + kernel_scan_files_init_impl(snapshot, table_client, predicate).into_extern_result(table_client) +} + +fn kernel_scan_files_init_impl( + snapshot: *const SnapshotHandle, + extern_table_client: *const ExternTableClientHandle, + predicate: Option<&mut EnginePredicate>, +) -> DeltaResult<*mut KernelScanFileIterator> { let snapshot = unsafe { ArcHandle::clone_as_arc(snapshot) }; - let table_client = unsafe { ArcHandle::clone_as_arc(table_client) }; + let extern_table_client = unsafe { ArcHandle::clone_as_arc(extern_table_client) }; let mut scan_builder = ScanBuilder::new(snapshot.clone()); if let Some(predicate) = predicate { // TODO: There is a lot of redundancy between the various visit_expression_XXX methods here, @@ -618,30 +725,40 @@ pub unsafe extern "C" fn kernel_scan_files_init( scan_builder = scan_builder.with_predicate(predicate); } } - let real_table_client = table_client.table_client(); let scan_adds = scan_builder .build() - .files(real_table_client.as_ref()) - .unwrap(); - let files = scan_adds.map(|add| add.unwrap().path); + .files(extern_table_client.table_client().as_ref())?; let files = KernelScanFileIterator { - files: Box::new(Mutex::new(files)), - _snapshot: snapshot, - _table_client: real_table_client, + files: Box::new(scan_adds), + table_client: extern_table_client, }; - Box::into_raw(Box::new(files)) + Ok(files.into_handle()) } +/// # Safety +/// +/// The iterator must be valid (returned by [kernel_scan_files_init]) and not yet freed by +/// [kernel_scan_files_free]. The visitor function pointer must be non-null. #[no_mangle] -pub extern "C" fn kernel_scan_files_next( +pub unsafe extern "C" fn kernel_scan_files_next( + files: &mut KernelScanFileIterator, + engine_context: *mut c_void, + engine_visitor: extern "C" fn(engine_context: *mut c_void, file_name: KernelStringSlice), +) -> ExternResult { + kernel_scan_files_next_impl(files, engine_context, engine_visitor) + .into_extern_result(files.table_client.error_allocator()) +} +fn kernel_scan_files_next_impl( files: &mut KernelScanFileIterator, engine_context: *mut c_void, - engine_visitor: extern "C" fn(engine_context: *mut c_void, ptr: *const c_char, len: usize), -) { - let mut files = files.files.lock().unwrap(); - if let Some(file) = files.next() { - println!("Got file: {}", file); - (engine_visitor)(engine_context, file.as_ptr().cast(), file.len()); + engine_visitor: extern "C" fn(engine_context: *mut c_void, file_name: KernelStringSlice), +) -> DeltaResult { + if let Some(add) = files.files.next().transpose()? { + println!("Got file: {}", add.path); + (engine_visitor)(engine_context, add.path.as_str().into()); + Ok(true) + } else { + Ok(false) } } @@ -651,5 +768,5 @@ pub extern "C" fn kernel_scan_files_next( /// [kernel_scan_files_init]. #[no_mangle] pub unsafe extern "C" fn kernel_scan_files_free(files: *mut KernelScanFileIterator) { - let _ = unsafe { Box::from_raw(files) }; + BoxHandle::drop_handle(files); } From f231ce0d609defc304f9ea1f9457279d2960cd4c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 26 Feb 2024 16:21:17 -0800 Subject: [PATCH 10/27] review comments --- ffi/kernel.h | 27 +++++++++++++++++---- ffi/kernel.hpp | 9 ++++--- ffi/src/handle.rs | 31 ++++++++++++++++-------- ffi/src/lib.rs | 61 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 92 insertions(+), 36 deletions(-) diff --git a/ffi/kernel.h b/ffi/kernel.h index d9e2fdb38..710c6a5d7 100644 --- a/ffi/kernel.h +++ b/ffi/kernel.h @@ -124,6 +124,23 @@ typedef struct EngineSchemaVisitor { void (*visit_long)(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name); } EngineSchemaVisitor; +typedef enum ExternResult_usize_Tag { + Ok_usize, + Err_usize, +} ExternResult_usize_Tag; + +typedef struct ExternResult_usize { + ExternResult_usize_Tag tag; + union { + struct { + uintptr_t ok; + }; + struct { + struct EngineError *err; + }; + }; +} ExternResult_usize; + typedef enum ExternResult_____KernelScanFileIterator_Tag { Ok_____KernelScanFileIterator, Err_____KernelScanFileIterator, @@ -233,15 +250,17 @@ uintptr_t visit_expression_eq(struct KernelExpressionVisitorState *state, uintpt * # Safety * The string slice must be valid */ -uintptr_t visit_expression_column(struct KernelExpressionVisitorState *state, - struct KernelStringSlice name); +struct ExternResult_usize visit_expression_column(struct KernelExpressionVisitorState *state, + struct KernelStringSlice name, + AllocateErrorFn allocate_error); /** * # Safety * The string slice must be valid */ -uintptr_t visit_expression_literal_string(struct KernelExpressionVisitorState *state, - struct KernelStringSlice value); +struct ExternResult_usize visit_expression_literal_string(struct KernelExpressionVisitorState *state, + struct KernelStringSlice value, + AllocateErrorFn allocate_error); uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *state, int64_t value); diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp index 55f0c384c..4369e983f 100644 --- a/ffi/kernel.hpp +++ b/ffi/kernel.hpp @@ -165,12 +165,15 @@ uintptr_t visit_expression_eq(KernelExpressionVisitorState *state, uintptr_t a, /// # Safety /// The string slice must be valid -uintptr_t visit_expression_column(KernelExpressionVisitorState *state, KernelStringSlice name); +ExternResult visit_expression_column(KernelExpressionVisitorState *state, + KernelStringSlice name, + AllocateErrorFn allocate_error); /// # Safety /// The string slice must be valid -uintptr_t visit_expression_literal_string(KernelExpressionVisitorState *state, - KernelStringSlice value); +ExternResult visit_expression_literal_string(KernelExpressionVisitorState *state, + KernelStringSlice value, + AllocateErrorFn allocate_error); uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int64_t value); diff --git a/ffi/src/handle.rs b/ffi/src/handle.rs index 9e9e41ba5..757519bcf 100644 --- a/ffi/src/handle.rs +++ b/ffi/src/handle.rs @@ -1,3 +1,14 @@ +//! A set of traits and types to help safely and ergonically pass rust objects across the FFI +//! boundary as thin pointer "handles". A handle is an opaque type that aids type safety by uniquely +//! identifying some Rust type that may not even be representable (such as traits). There are three +//! kinds of handles: +//! +//! * [BoxHandle] represents the content of a `Box`, where `T` is a sized type. +//! * [ArcHandle] represents the content of an `Arc`, where `T` may not be sized. +//! * [SizedArcHandle] specializes [ArcHandle] to handle sized types efficiently. +//! +//! Box handles are useful for passing owned sized data across the FFI boundary, while Arc handles +//! are more suitable for shared and/or unsized data. use std::sync::Arc; /// Helper trait that simplifies passing an instance of a Sized type across the FFI boundary as a @@ -17,23 +28,23 @@ pub trait BoxHandle: Sized { } } -mod uncreate { - /// A struct that cannot be instantiated by any rust code because this modeule exposes no public - /// constructor for it. - pub struct Uncreate { +mod unconstructable { + /// A struct that cannot be instantiated by any rust code because this module exposes no public + /// constructor for it. Intentionally _NOT_ a zero-sized type, to avoid weirdness in C/C++ land. + pub struct Unconstructable { _private: usize, } } // Make it easy to use (the module was only used to enforce member privacy). -pub type Uncreate = uncreate::Uncreate; +pub type Unconstructable = unconstructable::Unconstructable; /// Helper trait that allows passing `Arc` across the FFI boundary as a thin-pointer handle /// type. The pointer remains valid until freed by a call to [ArcHandle::drop_handle]. The handle is /// strongly typed, in the sense that each handle type maps to a single `Target` type. /// /// Typically, the handle (`Self`) is an opaque struct (_not_ `repr(C)`) with an FFI-friendly name -/// name, containing an [Uncreate] member that prevents rust code from legally instantiating it. +/// name, containing an [Unconstructable] member so rust code cannot legally instantiate it. /// /// # Examples /// @@ -44,7 +55,7 @@ pub type Uncreate = uncreate::Uncreate; /// /// // The handle that will represent `MyStruct` externally /// struct MyTraitHandle { -/// _uncreate: Uncreate, +/// _unconstructable: Unconstructable, /// } /// /// // Connect the two @@ -69,7 +80,7 @@ pub type Uncreate = uncreate::Uncreate; /// /// // The handle that will represent `MyStruct` externally /// struct MyStructHandle { -/// _uncreate: Uncreate, +/// _unconstructable: Unconstructable, /// } /// /// // Connect the two @@ -184,8 +195,8 @@ pub trait SizedArcHandle: Sized { // A blanket implementation of `ArcHandle` for all types satisfying `SizedArcHandle`. // // Unlike the default (unsized) implementation, which must wrap the Arc in a Box in order to obtain -// a thin pointer, the sized implementation can directly return a pointer to the Arc's -// underlying. This blanket implementation applies automatically to all types that implement +// a thin pointer, the sized implementation can directly return a pointer to the Arc's underlying +// type. This blanket implementation applies automatically to all types that implement // SizedArcHandle, so a type that implements SizedArcHandle cannot directly implement ArcHandle // (which is a Good Thing, because it preserves the important type safety invariant that every // handle has exactly one `Target` type): diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index e14ba9fe4..aa534ed76 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -17,7 +17,7 @@ use deltakernel::Add; use deltakernel::{DeltaResult, Error, TableClient}; mod handle; -use handle::{ArcHandle, BoxHandle, SizedArcHandle, Uncreate}; +use handle::{ArcHandle, BoxHandle, SizedArcHandle, Unconstructable}; /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap /// the engine functions. The engine retains ownership of the iterator. @@ -92,20 +92,23 @@ impl Into for &str { } } -trait FromStringSlice { - unsafe fn from_slice(slice: KernelStringSlice) -> Self; +trait TryFromStringSlice: Sized { + unsafe fn try_from_slice(slice: KernelStringSlice) -> DeltaResult; } -impl FromStringSlice for String { +impl TryFromStringSlice for String { /// Converts a slice back to a string /// /// # Safety /// /// The slice must be a valid (non-null) pointer, and must point to the indicated number of /// valid utf8 bytes. - unsafe fn from_slice(slice: KernelStringSlice) -> String { - let slice = std::slice::from_raw_parts(slice.ptr.cast(), slice.len); - std::str::from_utf8_unchecked(slice).to_string() + unsafe fn try_from_slice(slice: KernelStringSlice) -> DeltaResult { + let slice = unsafe { std::slice::from_raw_parts(slice.ptr.cast(), slice.len) }; + match std::str::from_utf8(slice) { + Ok(s) => Ok(s.to_string()), + Err(e) => Err(Error::generic_err(e)), + } } } @@ -172,8 +175,12 @@ impl From for KernelError { } } -/// An error that can be returned to the engine. Engines can define additional struct fields on -/// their side, by e.g. embedding this struct as the first member of a larger struct. +/// An error that can be returned to the engine. Engines that wish to associate additional +/// information can define and use any type that is [pointer +/// interconvertible](https://en.cppreference.com/w/cpp/language/static_cast#pointer-interconvertible) +/// with this one -- e.g. by subclassing this struct or by embedding this struct as the first member +/// of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) +/// class. #[repr(C)] pub struct EngineError { etype: KernelError, @@ -281,7 +288,7 @@ pub trait ExternTableClient { } pub struct ExternTableClientHandle { - _uncreate: Uncreate, + _unconstructable: Unconstructable, } impl ArcHandle for ExternTableClientHandle { @@ -319,8 +326,8 @@ impl ExternTableClient for ExternTableClientVtable { /// /// Caller is responsible to pass a valid path pointer. unsafe fn unwrap_and_parse_path_as_url(path: KernelStringSlice) -> DeltaResult { - let path = unsafe { String::from_slice(path) }; - let path = std::fs::canonicalize(PathBuf::from(path)).map_err(Error::generic)?; + let path = unsafe { String::try_from_slice(path) }; + let path = std::fs::canonicalize(PathBuf::from(path?)).map_err(Error::generic)?; Url::from_directory_path(path).map_err(|_| Error::generic("Invalid url")) } @@ -363,7 +370,7 @@ pub unsafe extern "C" fn drop_table_client(table_client: *const ExternTableClien } pub struct SnapshotHandle { - _uncreate: Uncreate, + _unconstructable: Unconstructable, } impl SizedArcHandle for SnapshotHandle { @@ -646,8 +653,16 @@ pub extern "C" fn visit_expression_eq( pub unsafe extern "C" fn visit_expression_column( state: &mut KernelExpressionVisitorState, name: KernelStringSlice, -) -> usize { - wrap_expression(state, Expression::Column(String::from_slice(name))) + allocate_error: AllocateErrorFn, +) -> ExternResult { + visit_expression_column_impl(state, name).into_extern_result(allocate_error) +} +unsafe fn visit_expression_column_impl( + state: &mut KernelExpressionVisitorState, + name: KernelStringSlice, +) -> DeltaResult { + let name = unsafe { String::try_from_slice(name) }; + Ok(wrap_expression(state, Expression::Column(name?))) } /// # Safety @@ -656,11 +671,19 @@ pub unsafe extern "C" fn visit_expression_column( pub unsafe extern "C" fn visit_expression_literal_string( state: &mut KernelExpressionVisitorState, value: KernelStringSlice, -) -> usize { - wrap_expression( + allocate_error: AllocateErrorFn, +) -> ExternResult { + visit_expression_literal_string_impl(state, value).into_extern_result(allocate_error) +} +unsafe fn visit_expression_literal_string_impl( + state: &mut KernelExpressionVisitorState, + value: KernelStringSlice, +) -> DeltaResult { + let value = unsafe { String::try_from_slice(value) }; + Ok(wrap_expression( state, - Expression::Literal(Scalar::from(String::from_slice(value))), - ) + Expression::Literal(Scalar::from(value?)), + )) } #[no_mangle] From d068865fd09dbbe40badab6d09c1526856da93ec Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 1 Mar 2024 13:49:46 -0800 Subject: [PATCH 11/27] update header files --- ffi/kernel.h | 8 ++++++-- ffi/kernel.hpp | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ffi/kernel.h b/ffi/kernel.h index 710c6a5d7..29d258a52 100644 --- a/ffi/kernel.h +++ b/ffi/kernel.h @@ -44,8 +44,12 @@ typedef struct EngineIterator { } EngineIterator; /** - * An error that can be returned to the engine. Engines can define additional struct fields on - * their side, by e.g. embedding this struct as the first member of a larger struct. + * An error that can be returned to the engine. Engines that wish to associate additional + * information can define and use any type that is [pointer + * interconvertible](https://en.cppreference.com/w/cpp/language/static_cast#pointer-interconvertible) + * with this one -- e.g. by subclassing this struct or by embedding this struct as the first member + * of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) + * class. */ typedef struct EngineError { enum KernelError etype; diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp index 4369e983f..c25bf5126 100644 --- a/ffi/kernel.hpp +++ b/ffi/kernel.hpp @@ -40,8 +40,12 @@ struct EngineIterator { const void *(*get_next)(void *data); }; -/// An error that can be returned to the engine. Engines can define additional struct fields on -/// their side, by e.g. embedding this struct as the first member of a larger struct. +/// An error that can be returned to the engine. Engines that wish to associate additional +/// information can define and use any type that is [pointer +/// interconvertible](https://en.cppreference.com/w/cpp/language/static_cast#pointer-interconvertible) +/// with this one -- e.g. by subclassing this struct or by embedding this struct as the first member +/// of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) +/// class. struct EngineError { KernelError etype; }; From 0b32b3b416513fac8a8355311693b593e9c61f59 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 25 Mar 2024 11:36:22 -0700 Subject: [PATCH 12/27] fix logical merge conflicts --- ffi/kernel.h | 38 ++++++++++++------- ffi/kernel.hpp | 22 ++++++++--- ffi/src/lib.rs | 99 +++++++++++++++++++++++++------------------------- 3 files changed, 89 insertions(+), 70 deletions(-) diff --git a/ffi/kernel.h b/ffi/kernel.h index 29d258a52..e2d7b3371 100644 --- a/ffi/kernel.h +++ b/ffi/kernel.h @@ -7,9 +7,13 @@ typedef enum KernelError { UnknownError, FFIError, ArrowError, + EngineDataTypeError, + ExtractError, GenericError, + IOErrorError, ParquetError, ObjectStoreError, + ObjectStorePathError, FileNotFoundError, MissingColumnError, UnexpectedColumnTypeError, @@ -19,9 +23,15 @@ typedef enum KernelError { InvalidUrlError, MalformedJsonError, MissingMetadataError, + MissingProtocolError, + MissingMetadataAndProtocolError, + ParseError, + JoinFailureError, + Utf8Error, + ParseIntError, } KernelError; -typedef struct ExternTableClientHandle ExternTableClientHandle; +typedef struct ExternEngineInterfaceHandle ExternEngineInterfaceHandle; typedef struct KernelExpressionVisitorState KernelExpressionVisitorState; @@ -55,22 +65,22 @@ typedef struct EngineError { enum KernelError etype; } EngineError; -typedef enum ExternResult______ExternTableClientHandle_Tag { - Ok______ExternTableClientHandle, - Err______ExternTableClientHandle, -} ExternResult______ExternTableClientHandle_Tag; +typedef enum ExternResult______ExternEngineInterfaceHandle_Tag { + Ok______ExternEngineInterfaceHandle, + Err______ExternEngineInterfaceHandle, +} ExternResult______ExternEngineInterfaceHandle_Tag; -typedef struct ExternResult______ExternTableClientHandle { - ExternResult______ExternTableClientHandle_Tag tag; +typedef struct ExternResult______ExternEngineInterfaceHandle { + ExternResult______ExternEngineInterfaceHandle_Tag tag; union { struct { - const struct ExternTableClientHandle *ok; + const struct ExternEngineInterfaceHandle *ok; }; struct { struct EngineError *err; }; }; -} ExternResult______ExternTableClientHandle; +} ExternResult______ExternEngineInterfaceHandle; /** * A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The @@ -194,15 +204,15 @@ void iterate(struct EngineIterator *it); * * Caller is responsible to pass a valid path pointer. */ -struct ExternResult______ExternTableClientHandle get_default_client(struct KernelStringSlice path, - AllocateErrorFn allocate_error); +struct ExternResult______ExternEngineInterfaceHandle get_default_client(struct KernelStringSlice path, + AllocateErrorFn allocate_error); /** * # Safety * * Caller is responsible to pass a valid handle. */ -void drop_table_client(const struct ExternTableClientHandle *table_client); +void drop_table_client(const struct ExternEngineInterfaceHandle *table_client); /** * Get the latest snapshot from the specified table @@ -212,7 +222,7 @@ void drop_table_client(const struct ExternTableClientHandle *table_client); * Caller is responsible to pass valid handles and path pointer. */ struct ExternResult______SnapshotHandle snapshot(struct KernelStringSlice path, - const struct ExternTableClientHandle *table_client); + const struct ExternEngineInterfaceHandle *table_client); /** * # Safety @@ -275,7 +285,7 @@ uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *sta * Caller is responsible to pass a valid snapshot pointer. */ struct ExternResult_____KernelScanFileIterator kernel_scan_files_init(const struct SnapshotHandle *snapshot, - const struct ExternTableClientHandle *table_client, + const struct ExternEngineInterfaceHandle *table_client, struct EnginePredicate *predicate); /** diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp index c25bf5126..7e229a099 100644 --- a/ffi/kernel.hpp +++ b/ffi/kernel.hpp @@ -8,9 +8,13 @@ enum class KernelError { UnknownError, FFIError, ArrowError, + EngineDataTypeError, + ExtractError, GenericError, + IOErrorError, ParquetError, ObjectStoreError, + ObjectStorePathError, FileNotFoundError, MissingColumnError, UnexpectedColumnTypeError, @@ -20,9 +24,15 @@ enum class KernelError { InvalidUrlError, MalformedJsonError, MissingMetadataError, + MissingProtocolError, + MissingMetadataAndProtocolError, + ParseError, + JoinFailureError, + Utf8Error, + ParseIntError, }; -struct ExternTableClientHandle; +struct ExternEngineInterfaceHandle; struct KernelExpressionVisitorState; @@ -122,13 +132,13 @@ void iterate(EngineIterator *it); /// # Safety /// /// Caller is responsible to pass a valid path pointer. -ExternResult get_default_client(KernelStringSlice path, - AllocateErrorFn allocate_error); +ExternResult get_default_client(KernelStringSlice path, + AllocateErrorFn allocate_error); /// # Safety /// /// Caller is responsible to pass a valid handle. -void drop_table_client(const ExternTableClientHandle *table_client); +void drop_table_client(const ExternEngineInterfaceHandle *table_client); /// Get the latest snapshot from the specified table /// @@ -136,7 +146,7 @@ void drop_table_client(const ExternTableClientHandle *table_client); /// /// Caller is responsible to pass valid handles and path pointer. ExternResult snapshot(KernelStringSlice path, - const ExternTableClientHandle *table_client); + const ExternEngineInterfaceHandle *table_client); /// # Safety /// @@ -186,7 +196,7 @@ uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int /// /// Caller is responsible to pass a valid snapshot pointer. ExternResult kernel_scan_files_init(const SnapshotHandle *snapshot, - const ExternTableClientHandle *table_client, + const ExternEngineInterfaceHandle *table_client, EnginePredicate *predicate); /// # Safety diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index aa534ed76..ffdc71324 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -3,18 +3,17 @@ /// Exposes that an engine needs to call from C/C++ to interface with kernel use std::collections::HashMap; use std::default::Default; -use std::fmt::{Display, Formatter}; use std::os::raw::{c_char, c_void}; use std::path::PathBuf; use std::sync::Arc; use url::Url; +use deltakernel::actions::Add; use deltakernel::expressions::{BinaryOperator, Expression, Scalar}; use deltakernel::scan::ScanBuilder; use deltakernel::schema::{DataType, PrimitiveType, StructField, StructType}; use deltakernel::snapshot::Snapshot; -use deltakernel::Add; -use deltakernel::{DeltaResult, Error, TableClient}; +use deltakernel::{DeltaResult, EngineInterface, Error}; mod handle; use handle::{ArcHandle, BoxHandle, SizedArcHandle, Unconstructable}; @@ -113,13 +112,18 @@ impl TryFromStringSlice for String { } #[repr(C)] +#[derive(Debug)] pub enum KernelError { UnknownError, // catch-all for unrecognized kernel Error types FFIError, // errors encountered in the code layer that supports FFI ArrowError, + EngineDataTypeError, + ExtractError, GenericError, + IOErrorError, ParquetError, ObjectStoreError, + ObjectStorePathError, FileNotFoundError, MissingColumnError, UnexpectedColumnTypeError, @@ -129,28 +133,12 @@ pub enum KernelError { InvalidUrlError, MalformedJsonError, MissingMetadataError, -} - -impl Display for KernelError { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - KernelError::UnknownError => write!(f, "UnknownError"), - KernelError::FFIError => write!(f, "FFIError"), - KernelError::ArrowError => write!(f, "ArrowError"), - KernelError::GenericError => write!(f, "GenericError"), - KernelError::ParquetError => write!(f, "ParquetError"), - KernelError::ObjectStoreError => write!(f, "ObjectStoreError"), - KernelError::FileNotFoundError => write!(f, "FileNotFoundError"), - KernelError::MissingColumnError => write!(f, "MissingColumnError"), - KernelError::UnexpectedColumnTypeError => write!(f, "UnexpectedColumnTypeError"), - KernelError::MissingDataError => write!(f, "MissingDataError"), - KernelError::MissingVersionError => write!(f, "MissingVersionError"), - KernelError::DeletionVectorError => write!(f, "DeletionVectorError"), - KernelError::InvalidUrlError => write!(f, "InvalidUrlError"), - KernelError::MalformedJsonError => write!(f, "MalformedJsonError"), - KernelError::MissingMetadataError => write!(f, "MissingMetadataError"), - } - } + MissingProtocolError, + MissingMetadataAndProtocolError, + ParseError, + JoinFailureError, + Utf8Error, + ParseIntError, } impl From for KernelError { @@ -158,10 +146,14 @@ impl From for KernelError { match e { // NOTE: By definition, no kernel Error maps to FFIError Error::Arrow(_) => KernelError::ArrowError, + Error::EngineDataType(_) => KernelError::EngineDataTypeError, + Error::Extract(..) => KernelError::ExtractError, Error::Generic(_) => KernelError::GenericError, Error::GenericError { .. } => KernelError::GenericError, + Error::IOError(_) => KernelError::IOErrorError, Error::Parquet(_) => KernelError::ParquetError, Error::ObjectStore(_) => KernelError::ObjectStoreError, + Error::ObjectStorePath(_) => KernelError::ObjectStorePathError, Error::FileNotFound(_) => KernelError::FileNotFoundError, Error::MissingColumn(_) => KernelError::MissingColumnError, Error::UnexpectedColumnType(_) => KernelError::UnexpectedColumnTypeError, @@ -171,6 +163,12 @@ impl From for KernelError { Error::InvalidUrl(_) => KernelError::InvalidUrlError, Error::MalformedJson(_) => KernelError::MalformedJsonError, Error::MissingMetadata => KernelError::MissingMetadataError, + Error::MissingProtocol => KernelError::MissingProtocolError, + Error::MissingMetadataAndProtocol => KernelError::MissingMetadataAndProtocolError, + Error::ParseError(..) => KernelError::ParseError, + Error::JoinFailure(_) => KernelError::JoinFailureError, + Error::Utf8Error(_) => KernelError::Utf8Error, + Error::ParseIntError(_) => KernelError::ParseIntError, } } } @@ -244,7 +242,7 @@ impl AllocateError for AllocateErrorFn { self(etype, msg) } } -impl AllocateError for *const ExternTableClientHandle { +impl AllocateError for *const ExternEngineInterfaceHandle { /// # Safety /// /// In addition to the usual requirements, the table client handle must be valid. @@ -281,23 +279,23 @@ impl IntoExternResult for DeltaResult { } } -// A wrapper for TableClient which defines additional FFI-specific methods. -pub trait ExternTableClient { - fn table_client(&self) -> Arc; +// A wrapper for EngineInterface which defines additional FFI-specific methods. +pub trait ExternEngineInterface { + fn table_client(&self) -> Arc; fn error_allocator(&self) -> &dyn AllocateError; } -pub struct ExternTableClientHandle { +pub struct ExternEngineInterfaceHandle { _unconstructable: Unconstructable, } -impl ArcHandle for ExternTableClientHandle { - type Target = dyn ExternTableClient; +impl ArcHandle for ExternEngineInterfaceHandle { + type Target = dyn ExternEngineInterface; } -struct ExternTableClientVtable { +struct ExternEngineInterfaceVtable { // Actual table client instance to use - client: Arc, + client: Arc, allocate_error: AllocateErrorFn, } @@ -305,16 +303,16 @@ struct ExternTableClientVtable { /// /// Kernel doesn't use any threading or concurrency. If engine chooses to do so, engine is /// responsible to handle any races that could result. -unsafe impl Send for ExternTableClientVtable {} +unsafe impl Send for ExternEngineInterfaceVtable {} /// # Safety /// /// Kernel doesn't use any threading or concurrency. If engine chooses to do so, engine is /// responsible to handle any races that could result. -unsafe impl Sync for ExternTableClientVtable {} +unsafe impl Sync for ExternEngineInterfaceVtable {} -impl ExternTableClient for ExternTableClientVtable { - fn table_client(&self) -> Arc { +impl ExternEngineInterface for ExternEngineInterfaceVtable { + fn table_client(&self) -> Arc { self.client.clone() } fn error_allocator(&self) -> &dyn AllocateError { @@ -338,23 +336,24 @@ unsafe fn unwrap_and_parse_path_as_url(path: KernelStringSlice) -> DeltaResult ExternResult<*const ExternTableClientHandle> { +) -> ExternResult<*const ExternEngineInterfaceHandle> { get_default_client_impl(path, allocate_error).into_extern_result(allocate_error) } unsafe fn get_default_client_impl( path: KernelStringSlice, allocate_error: AllocateErrorFn, -) -> DeltaResult<*const ExternTableClientHandle> { +) -> DeltaResult<*const ExternEngineInterfaceHandle> { let url = unsafe { unwrap_and_parse_path_as_url(path) }?; - use deltakernel::client::executor::tokio::TokioBackgroundExecutor; - let table_client = deltakernel::DefaultTableClient::::try_new( + use deltakernel::client::default::executor::tokio::TokioBackgroundExecutor; + use deltakernel::client::default::DefaultEngineInterface; + let table_client = DefaultEngineInterface::::try_new( &url, HashMap::::new(), Arc::new(TokioBackgroundExecutor::new()), ); let client = Arc::new(table_client.map_err(Error::generic)?); - let client: Arc = Arc::new(ExternTableClientVtable { + let client: Arc = Arc::new(ExternEngineInterfaceVtable { client, allocate_error, }); @@ -365,7 +364,7 @@ unsafe fn get_default_client_impl( /// /// Caller is responsible to pass a valid handle. #[no_mangle] -pub unsafe extern "C" fn drop_table_client(table_client: *const ExternTableClientHandle) { +pub unsafe extern "C" fn drop_table_client(table_client: *const ExternEngineInterfaceHandle) { ArcHandle::drop_handle(table_client); } @@ -385,14 +384,14 @@ impl SizedArcHandle for SnapshotHandle { #[no_mangle] pub unsafe extern "C" fn snapshot( path: KernelStringSlice, - table_client: *const ExternTableClientHandle, + table_client: *const ExternEngineInterfaceHandle, ) -> ExternResult<*const SnapshotHandle> { snapshot_impl(path, table_client).into_extern_result(table_client) } unsafe fn snapshot_impl( path: KernelStringSlice, - extern_table_client: *const ExternTableClientHandle, + extern_table_client: *const ExternEngineInterfaceHandle, ) -> DeltaResult<*const SnapshotHandle> { let url = unsafe { unwrap_and_parse_path_as_url(path) }?; let extern_table_client = unsafe { ArcHandle::clone_as_arc(extern_table_client) }; @@ -454,7 +453,7 @@ pub unsafe extern "C" fn visit_schema( // Visit all the fields of a struct and return the list of children fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, s.fields.len()); - for field in s.fields.iter() { + for field in s.fields() { visit_field(visitor, child_list_id, field); } child_list_id @@ -702,7 +701,7 @@ pub struct KernelScanFileIterator { files: Box>>, // Also keep a reference to the external client for its error allocator. - table_client: Arc, + table_client: Arc, } impl BoxHandle for KernelScanFileIterator {} @@ -720,7 +719,7 @@ impl Drop for KernelScanFileIterator { #[no_mangle] pub unsafe extern "C" fn kernel_scan_files_init( snapshot: *const SnapshotHandle, - table_client: *const ExternTableClientHandle, + table_client: *const ExternEngineInterfaceHandle, predicate: Option<&mut EnginePredicate>, ) -> ExternResult<*mut KernelScanFileIterator> { kernel_scan_files_init_impl(snapshot, table_client, predicate).into_extern_result(table_client) @@ -728,7 +727,7 @@ pub unsafe extern "C" fn kernel_scan_files_init( fn kernel_scan_files_init_impl( snapshot: *const SnapshotHandle, - extern_table_client: *const ExternTableClientHandle, + extern_table_client: *const ExternEngineInterfaceHandle, predicate: Option<&mut EnginePredicate>, ) -> DeltaResult<*mut KernelScanFileIterator> { let snapshot = unsafe { ArcHandle::clone_as_arc(snapshot) }; From 12304ef736bc8d43dae763664aff94f27a5fef8c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 26 Mar 2024 07:59:58 -0700 Subject: [PATCH 13/27] remove generated header files --- ffi/kernel.h | 308 ------------------------------------------------- ffi/kernel.hpp | 217 ---------------------------------- 2 files changed, 525 deletions(-) delete mode 100644 ffi/kernel.h delete mode 100644 ffi/kernel.hpp diff --git a/ffi/kernel.h b/ffi/kernel.h deleted file mode 100644 index e2d7b3371..000000000 --- a/ffi/kernel.h +++ /dev/null @@ -1,308 +0,0 @@ -#include -#include -#include -#include - -typedef enum KernelError { - UnknownError, - FFIError, - ArrowError, - EngineDataTypeError, - ExtractError, - GenericError, - IOErrorError, - ParquetError, - ObjectStoreError, - ObjectStorePathError, - FileNotFoundError, - MissingColumnError, - UnexpectedColumnTypeError, - MissingDataError, - MissingVersionError, - DeletionVectorError, - InvalidUrlError, - MalformedJsonError, - MissingMetadataError, - MissingProtocolError, - MissingMetadataAndProtocolError, - ParseError, - JoinFailureError, - Utf8Error, - ParseIntError, -} KernelError; - -typedef struct ExternEngineInterfaceHandle ExternEngineInterfaceHandle; - -typedef struct KernelExpressionVisitorState KernelExpressionVisitorState; - -typedef struct KernelScanFileIterator KernelScanFileIterator; - -typedef struct SnapshotHandle SnapshotHandle; - -/** - * Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap - * the engine functions. The engine retains ownership of the iterator. - */ -typedef struct EngineIterator { - void *data; - /** - * A function that should advance the iterator and return the next time from the data - * If the iterator is complete, it should return null. It should be safe to - * call `get_next()` multiple times if it is null. - */ - const void *(*get_next)(void *data); -} EngineIterator; - -/** - * An error that can be returned to the engine. Engines that wish to associate additional - * information can define and use any type that is [pointer - * interconvertible](https://en.cppreference.com/w/cpp/language/static_cast#pointer-interconvertible) - * with this one -- e.g. by subclassing this struct or by embedding this struct as the first member - * of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) - * class. - */ -typedef struct EngineError { - enum KernelError etype; -} EngineError; - -typedef enum ExternResult______ExternEngineInterfaceHandle_Tag { - Ok______ExternEngineInterfaceHandle, - Err______ExternEngineInterfaceHandle, -} ExternResult______ExternEngineInterfaceHandle_Tag; - -typedef struct ExternResult______ExternEngineInterfaceHandle { - ExternResult______ExternEngineInterfaceHandle_Tag tag; - union { - struct { - const struct ExternEngineInterfaceHandle *ok; - }; - struct { - struct EngineError *err; - }; - }; -} ExternResult______ExternEngineInterfaceHandle; - -/** - * A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The - * slice is only valid until the function it was passed into returns, and should not be copied. - * - * # Safety - * - * Intentionally not Copy, Clone, Send, nor Sync. - * - * Whoever instantiates the struct must ensure it does not outlive the data it points to. The - * compiler cannot help us here, because raw pointers don't have lifetimes. To reduce the risk of - * accidental misuse, it is recommended to only instantiate this struct as a function arg, by - * converting a `&str` value `Into`, so the borrowed reference protects the - * function call (callee must not retain any references to the slice after the call returns): - * - * ``` - * fn wants_slice(slice: KernelStringSlice) { ... } - * let msg = String::from(...); - * wants_slice(msg.as_ref().into()); - * ``` - */ -typedef struct KernelStringSlice { - const char *ptr; - uintptr_t len; -} KernelStringSlice; - -typedef struct EngineError *(*AllocateErrorFn)(enum KernelError etype, struct KernelStringSlice msg); - -typedef enum ExternResult______SnapshotHandle_Tag { - Ok______SnapshotHandle, - Err______SnapshotHandle, -} ExternResult______SnapshotHandle_Tag; - -typedef struct ExternResult______SnapshotHandle { - ExternResult______SnapshotHandle_Tag tag; - union { - struct { - const struct SnapshotHandle *ok; - }; - struct { - struct EngineError *err; - }; - }; -} ExternResult______SnapshotHandle; - -typedef struct EngineSchemaVisitor { - void *data; - uintptr_t (*make_field_list)(void *data, uintptr_t reserve); - void (*visit_struct)(void *data, - uintptr_t sibling_list_id, - struct KernelStringSlice name, - uintptr_t child_list_id); - void (*visit_string)(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name); - void (*visit_integer)(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name); - void (*visit_long)(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name); -} EngineSchemaVisitor; - -typedef enum ExternResult_usize_Tag { - Ok_usize, - Err_usize, -} ExternResult_usize_Tag; - -typedef struct ExternResult_usize { - ExternResult_usize_Tag tag; - union { - struct { - uintptr_t ok; - }; - struct { - struct EngineError *err; - }; - }; -} ExternResult_usize; - -typedef enum ExternResult_____KernelScanFileIterator_Tag { - Ok_____KernelScanFileIterator, - Err_____KernelScanFileIterator, -} ExternResult_____KernelScanFileIterator_Tag; - -typedef struct ExternResult_____KernelScanFileIterator { - ExternResult_____KernelScanFileIterator_Tag tag; - union { - struct { - struct KernelScanFileIterator *ok; - }; - struct { - struct EngineError *err; - }; - }; -} ExternResult_____KernelScanFileIterator; - -typedef struct EnginePredicate { - void *predicate; - uintptr_t (*visitor)(void *predicate, struct KernelExpressionVisitorState *state); -} EnginePredicate; - -typedef enum ExternResult_bool_Tag { - Ok_bool, - Err_bool, -} ExternResult_bool_Tag; - -typedef struct ExternResult_bool { - ExternResult_bool_Tag tag; - union { - struct { - bool ok; - }; - struct { - struct EngineError *err; - }; - }; -} ExternResult_bool; - -/** - * test function to print for items. this assumes each item is an `int` - */ -void iterate(struct EngineIterator *it); - -/** - * # Safety - * - * Caller is responsible to pass a valid path pointer. - */ -struct ExternResult______ExternEngineInterfaceHandle get_default_client(struct KernelStringSlice path, - AllocateErrorFn allocate_error); - -/** - * # Safety - * - * Caller is responsible to pass a valid handle. - */ -void drop_table_client(const struct ExternEngineInterfaceHandle *table_client); - -/** - * Get the latest snapshot from the specified table - * - * # Safety - * - * Caller is responsible to pass valid handles and path pointer. - */ -struct ExternResult______SnapshotHandle snapshot(struct KernelStringSlice path, - const struct ExternEngineInterfaceHandle *table_client); - -/** - * # Safety - * - * Caller is responsible to pass a valid handle. - */ -void drop_snapshot(const struct SnapshotHandle *snapshot); - -/** - * Get the version of the specified snapshot - * - * # Safety - * - * Caller is responsible to pass a valid handle. - */ -uint64_t version(const struct SnapshotHandle *snapshot); - -/** - * # Safety - * - * Caller is responsible to pass a valid handle. - */ -uintptr_t visit_schema(const struct SnapshotHandle *snapshot, struct EngineSchemaVisitor *visitor); - -uintptr_t visit_expression_and(struct KernelExpressionVisitorState *state, - struct EngineIterator *children); - -uintptr_t visit_expression_lt(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_le(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_gt(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_ge(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_eq(struct KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -/** - * # Safety - * The string slice must be valid - */ -struct ExternResult_usize visit_expression_column(struct KernelExpressionVisitorState *state, - struct KernelStringSlice name, - AllocateErrorFn allocate_error); - -/** - * # Safety - * The string slice must be valid - */ -struct ExternResult_usize visit_expression_literal_string(struct KernelExpressionVisitorState *state, - struct KernelStringSlice value, - AllocateErrorFn allocate_error); - -uintptr_t visit_expression_literal_long(struct KernelExpressionVisitorState *state, int64_t value); - -/** - * Get a FileList for all the files that need to be read from the table. - * # Safety - * - * Caller is responsible to pass a valid snapshot pointer. - */ -struct ExternResult_____KernelScanFileIterator kernel_scan_files_init(const struct SnapshotHandle *snapshot, - const struct ExternEngineInterfaceHandle *table_client, - struct EnginePredicate *predicate); - -/** - * # Safety - * - * The iterator must be valid (returned by [kernel_scan_files_init]) and not yet freed by - * [kernel_scan_files_free]. The visitor function pointer must be non-null. - */ -struct ExternResult_bool kernel_scan_files_next(struct KernelScanFileIterator *files, - void *engine_context, - void (*engine_visitor)(void *engine_context, - struct KernelStringSlice file_name)); - -/** - * # Safety - * - * Caller is responsible to (at most once) pass a valid pointer returned by a call to - * [kernel_scan_files_init]. - */ -void kernel_scan_files_free(struct KernelScanFileIterator *files); diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp deleted file mode 100644 index 7e229a099..000000000 --- a/ffi/kernel.hpp +++ /dev/null @@ -1,217 +0,0 @@ -#include -#include -#include -#include -#include - -enum class KernelError { - UnknownError, - FFIError, - ArrowError, - EngineDataTypeError, - ExtractError, - GenericError, - IOErrorError, - ParquetError, - ObjectStoreError, - ObjectStorePathError, - FileNotFoundError, - MissingColumnError, - UnexpectedColumnTypeError, - MissingDataError, - MissingVersionError, - DeletionVectorError, - InvalidUrlError, - MalformedJsonError, - MissingMetadataError, - MissingProtocolError, - MissingMetadataAndProtocolError, - ParseError, - JoinFailureError, - Utf8Error, - ParseIntError, -}; - -struct ExternEngineInterfaceHandle; - -struct KernelExpressionVisitorState; - -struct KernelScanFileIterator; - -struct SnapshotHandle; - -/// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap -/// the engine functions. The engine retains ownership of the iterator. -struct EngineIterator { - void *data; - /// A function that should advance the iterator and return the next time from the data - /// If the iterator is complete, it should return null. It should be safe to - /// call `get_next()` multiple times if it is null. - const void *(*get_next)(void *data); -}; - -/// An error that can be returned to the engine. Engines that wish to associate additional -/// information can define and use any type that is [pointer -/// interconvertible](https://en.cppreference.com/w/cpp/language/static_cast#pointer-interconvertible) -/// with this one -- e.g. by subclassing this struct or by embedding this struct as the first member -/// of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) -/// class. -struct EngineError { - KernelError etype; -}; - -template -struct ExternResult { - enum class Tag { - Ok, - Err, - }; - - struct Ok_Body { - T _0; - }; - - struct Err_Body { - EngineError *_0; - }; - - Tag tag; - union { - Ok_Body ok; - Err_Body err; - }; -}; - -/// A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The -/// slice is only valid until the function it was passed into returns, and should not be copied. -/// -/// # Safety -/// -/// Intentionally not Copy, Clone, Send, nor Sync. -/// -/// Whoever instantiates the struct must ensure it does not outlive the data it points to. The -/// compiler cannot help us here, because raw pointers don't have lifetimes. To reduce the risk of -/// accidental misuse, it is recommended to only instantiate this struct as a function arg, by -/// converting a `&str` value `Into`, so the borrowed reference protects the -/// function call (callee must not retain any references to the slice after the call returns): -/// -/// ``` -/// fn wants_slice(slice: KernelStringSlice) { ... } -/// let msg = String::from(...); -/// wants_slice(msg.as_ref().into()); -/// ``` -struct KernelStringSlice { - const char *ptr; - uintptr_t len; -}; - -using AllocateErrorFn = EngineError*(*)(KernelError etype, KernelStringSlice msg); - -struct EngineSchemaVisitor { - void *data; - uintptr_t (*make_field_list)(void *data, uintptr_t reserve); - void (*visit_struct)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uintptr_t child_list_id); - void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); -}; - -struct EnginePredicate { - void *predicate; - uintptr_t (*visitor)(void *predicate, KernelExpressionVisitorState *state); -}; - -extern "C" { - -/// test function to print for items. this assumes each item is an `int` -void iterate(EngineIterator *it); - -/// # Safety -/// -/// Caller is responsible to pass a valid path pointer. -ExternResult get_default_client(KernelStringSlice path, - AllocateErrorFn allocate_error); - -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -void drop_table_client(const ExternEngineInterfaceHandle *table_client); - -/// Get the latest snapshot from the specified table -/// -/// # Safety -/// -/// Caller is responsible to pass valid handles and path pointer. -ExternResult snapshot(KernelStringSlice path, - const ExternEngineInterfaceHandle *table_client); - -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -void drop_snapshot(const SnapshotHandle *snapshot); - -/// Get the version of the specified snapshot -/// -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -uint64_t version(const SnapshotHandle *snapshot); - -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -uintptr_t visit_schema(const SnapshotHandle *snapshot, EngineSchemaVisitor *visitor); - -uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); - -uintptr_t visit_expression_lt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_le(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_gt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_ge(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_eq(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -/// # Safety -/// The string slice must be valid -ExternResult visit_expression_column(KernelExpressionVisitorState *state, - KernelStringSlice name, - AllocateErrorFn allocate_error); - -/// # Safety -/// The string slice must be valid -ExternResult visit_expression_literal_string(KernelExpressionVisitorState *state, - KernelStringSlice value, - AllocateErrorFn allocate_error); - -uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int64_t value); - -/// Get a FileList for all the files that need to be read from the table. -/// # Safety -/// -/// Caller is responsible to pass a valid snapshot pointer. -ExternResult kernel_scan_files_init(const SnapshotHandle *snapshot, - const ExternEngineInterfaceHandle *table_client, - EnginePredicate *predicate); - -/// # Safety -/// -/// The iterator must be valid (returned by [kernel_scan_files_init]) and not yet freed by -/// [kernel_scan_files_free]. The visitor function pointer must be non-null. -ExternResult kernel_scan_files_next(KernelScanFileIterator *files, - void *engine_context, - void (*engine_visitor)(void *engine_context, - KernelStringSlice file_name)); - -/// # Safety -/// -/// Caller is responsible to (at most once) pass a valid pointer returned by a call to -/// [kernel_scan_files_init]. -void kernel_scan_files_free(KernelScanFileIterator *files); - -} // extern "C" From fbe16b1629ea342a186f07c39cae606cf41cce56 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 09:12:09 -0700 Subject: [PATCH 14/27] new feature-flag in FFI crate 'default-client' that enables kernel default client and FFI wrappers for it --- ffi/Cargo.toml | 2 ++ ffi/src/lib.rs | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index c6eb62861..724df11ff 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -20,3 +20,5 @@ deltakernel = { path = "../kernel", features = ["developer-visibility", "tokio"] cbindgen = "0.26.0" libc = "0.2.147" +[features] +default-client = ["deltakernel/default-client"] diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index ffdc71324..ccdf47cd5 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -1,6 +1,7 @@ /// FFI interface for the delta kernel /// /// Exposes that an engine needs to call from C/C++ to interface with kernel +#[cfg(feature = "default-client")] use std::collections::HashMap; use std::default::Default; use std::os::raw::{c_char, c_void}; @@ -122,7 +123,9 @@ pub enum KernelError { GenericError, IOErrorError, ParquetError, + #[cfg(feature = "default-client")] ObjectStoreError, + #[cfg(feature = "default-client")] ObjectStorePathError, FileNotFoundError, MissingColumnError, @@ -152,7 +155,9 @@ impl From for KernelError { Error::GenericError { .. } => KernelError::GenericError, Error::IOError(_) => KernelError::IOErrorError, Error::Parquet(_) => KernelError::ParquetError, + #[cfg(feature = "default-client")] Error::ObjectStore(_) => KernelError::ObjectStoreError, + #[cfg(feature = "default-client")] Error::ObjectStorePath(_) => KernelError::ObjectStorePathError, Error::FileNotFound(_) => KernelError::FileNotFoundError, Error::MissingColumn(_) => KernelError::MissingColumnError, @@ -332,6 +337,7 @@ unsafe fn unwrap_and_parse_path_as_url(path: KernelStringSlice) -> DeltaResult Date: Wed, 3 Apr 2024 09:51:48 -0700 Subject: [PATCH 15/27] cleanup and comments --- ffi/Cargo.toml | 1 + ffi/src/lib.rs | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index 724df11ff..b9c547e43 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -13,6 +13,7 @@ build = "build.rs" crate-type = ["cdylib", "staticlib"] [dependencies] +tracing = "0.1" url = "2" deltakernel = { path = "../kernel", features = ["developer-visibility", "tokio"] } diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index ccdf47cd5..5fbd0b4dd 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -7,6 +7,7 @@ use std::default::Default; use std::os::raw::{c_char, c_void}; use std::path::PathBuf; use std::sync::Arc; +use tracing::debug; use url::Url; use deltakernel::actions::Add; @@ -189,6 +190,8 @@ pub struct EngineError { etype: KernelError, } +/// Semantics: Kernel will always immediately return the leaked engine error to the engine (if it +/// allocated one at all), and engine is responsible to free it. #[repr(C)] pub enum ExternResult { Ok(T), @@ -298,6 +301,7 @@ impl ArcHandle for ExternEngineInterfaceHandle { type Target = dyn ExternEngineInterface; } +// TODO missing a drop method struct ExternEngineInterfaceVtable { // Actual table client instance to use client: Arc, @@ -314,6 +318,11 @@ unsafe impl Send for ExternEngineInterfaceVtable {} /// /// Kernel doesn't use any threading or concurrency. If engine chooses to do so, engine is /// responsible to handle any races that could result. +/// +/// These are needed because anything wrapped in Arc "should" implement it +/// Basically, by failing to implement these traits, we forbid the engine from being able to declare +/// its thread-safety (because rust assumes it is not threadsafe). By implementing them, we leave it +/// up to the engine to enforce thread safety if engine chooses to use threads at all. unsafe impl Sync for ExternEngineInterfaceVtable {} impl ExternEngineInterface for ExternEngineInterfaceVtable { @@ -439,13 +448,13 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, child_list_id: usize, - ) -> (), + ), visit_string: - extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice) -> (), + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), visit_integer: - extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice) -> (), + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), visit_long: - extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice) -> (), + extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), } /// # Safety @@ -536,6 +545,7 @@ impl Default for ReferenceSet { fn default() -> Self { Self { map: Default::default(), + // NOTE: 0 is interpreted as None next_id: 1, } } @@ -580,6 +590,7 @@ fn unwrap_kernel_expression( state.inflight_expressions.take(exprid) } +// TODO move visitors to separate module fn visit_expression_binary( state: &mut KernelExpressionVisitorState, op: BinaryOperator, @@ -708,6 +719,8 @@ pub struct KernelScanFileIterator { files: Box>>, // Also keep a reference to the external client for its error allocator. + // Parquet and Json handlers don't hold any reference to the tokio reactor, so if the last table + // client goes out of scope the iterator terminates early. table_client: Arc, } @@ -715,7 +728,7 @@ impl BoxHandle for KernelScanFileIterator {} impl Drop for KernelScanFileIterator { fn drop(&mut self) { - println!("dropping KernelScanFileIterator"); + debug!("dropping KernelScanFileIterator"); } } @@ -795,6 +808,8 @@ fn kernel_scan_files_next_impl( /// /// Caller is responsible to (at most once) pass a valid pointer returned by a call to /// [kernel_scan_files_init]. +// we should probably be consistent with drop vs. free on engine side (probably the latter is more +// intuitive to non-rust code) #[no_mangle] pub unsafe extern "C" fn kernel_scan_files_free(files: *mut KernelScanFileIterator) { BoxHandle::drop_handle(files); From 6412bbb8cf6594f571208689ada7b048c56b3108 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 09:53:06 -0700 Subject: [PATCH 16/27] fmt --- ffi/src/lib.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 5fbd0b4dd..e6229a7fd 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -449,12 +449,10 @@ pub struct EngineSchemaVisitor { name: KernelStringSlice, child_list_id: usize, ), - visit_string: - extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), + visit_string: extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), visit_integer: extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), - visit_long: - extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), + visit_long: extern "C" fn(data: *mut c_void, sibling_list_id: usize, name: KernelStringSlice), } /// # Safety From ff07d9a1271648e563982fd04837cc9188e2fa64 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 14:20:16 -0700 Subject: [PATCH 17/27] fix DEFINES for cbindgen feature flag --- ffi/cbindgen.toml | 11 +++ ffi/kernel.hpp | 225 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 ffi/kernel.hpp diff --git a/ffi/cbindgen.toml b/ffi/cbindgen.toml index e69de29bb..d467b2685 100644 --- a/ffi/cbindgen.toml +++ b/ffi/cbindgen.toml @@ -0,0 +1,11 @@ +language = "C++" + +# A list of substitutions for converting cfg's to ifdefs. cfgs which aren't +# defined here will just be discarded. +# +# e.g. +# `#[cfg(target = "freebsd")] ...` +# becomes +# `#if defined(DEFINE_FREEBSD) ... #endif` +[defines] +"feature = default-client" = "DEFINE_DEFAULT_CLIENT" diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp new file mode 100644 index 000000000..b10a938fe --- /dev/null +++ b/ffi/kernel.hpp @@ -0,0 +1,225 @@ +#include +#include +#include +#include +#include + +enum class KernelError { + UnknownError, + FFIError, + ArrowError, + EngineDataTypeError, + ExtractError, + GenericError, + IOErrorError, + ParquetError, +#if defined(DEFINE_DEFAULT_CLIENT) + ObjectStoreError, +#endif +#if defined(DEFINE_DEFAULT_CLIENT) + ObjectStorePathError, +#endif + FileNotFoundError, + MissingColumnError, + UnexpectedColumnTypeError, + MissingDataError, + MissingVersionError, + DeletionVectorError, + InvalidUrlError, + MalformedJsonError, + MissingMetadataError, + MissingProtocolError, + MissingMetadataAndProtocolError, + ParseError, + JoinFailureError, + Utf8Error, + ParseIntError, +}; + +struct ExternEngineInterfaceHandle; + +struct KernelExpressionVisitorState; + +struct KernelScanFileIterator; + +struct SnapshotHandle; + +/// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap +/// the engine functions. The engine retains ownership of the iterator. +struct EngineIterator { + void *data; + /// A function that should advance the iterator and return the next time from the data + /// If the iterator is complete, it should return null. It should be safe to + /// call `get_next()` multiple times if it is null. + const void *(*get_next)(void *data); +}; + +/// An error that can be returned to the engine. Engines that wish to associate additional +/// information can define and use any type that is [pointer +/// interconvertible](https://en.cppreference.com/w/cpp/language/static_cast#pointer-interconvertible) +/// with this one -- e.g. by subclassing this struct or by embedding this struct as the first member +/// of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) +/// class. +struct EngineError { + KernelError etype; +}; + +/// Semantics: Kernel will always immediately return the leaked engine error to the engine (if it +/// allocated one at all), and engine is responsible to free it. +template +struct ExternResult { + enum class Tag { + Ok, + Err, + }; + + struct Ok_Body { + T _0; + }; + + struct Err_Body { + EngineError *_0; + }; + + Tag tag; + union { + Ok_Body ok; + Err_Body err; + }; +}; + +/// A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The +/// slice is only valid until the function it was passed into returns, and should not be copied. +/// +/// # Safety +/// +/// Intentionally not Copy, Clone, Send, nor Sync. +/// +/// Whoever instantiates the struct must ensure it does not outlive the data it points to. The +/// compiler cannot help us here, because raw pointers don't have lifetimes. To reduce the risk of +/// accidental misuse, it is recommended to only instantiate this struct as a function arg, by +/// converting a `&str` value `Into`, so the borrowed reference protects the +/// function call (callee must not retain any references to the slice after the call returns): +/// +/// ``` +/// fn wants_slice(slice: KernelStringSlice) { ... } +/// let msg = String::from(...); +/// wants_slice(msg.as_ref().into()); +/// ``` +struct KernelStringSlice { + const char *ptr; + uintptr_t len; +}; + +using AllocateErrorFn = EngineError*(*)(KernelError etype, KernelStringSlice msg); + +struct EngineSchemaVisitor { + void *data; + uintptr_t (*make_field_list)(void *data, uintptr_t reserve); + void (*visit_struct)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + uintptr_t child_list_id); + void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); +}; + +struct EnginePredicate { + void *predicate; + uintptr_t (*visitor)(void *predicate, KernelExpressionVisitorState *state); +}; + +extern "C" { + +/// test function to print for items. this assumes each item is an `int` +void iterate(EngineIterator *it); + +#if defined(DEFINE_DEFAULT_CLIENT) +/// # Safety +/// +/// Caller is responsible to pass a valid path pointer. +ExternResult get_default_client(KernelStringSlice path, + AllocateErrorFn allocate_error); +#endif + +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +void drop_table_client(const ExternEngineInterfaceHandle *table_client); + +/// Get the latest snapshot from the specified table +/// +/// # Safety +/// +/// Caller is responsible to pass valid handles and path pointer. +ExternResult snapshot(KernelStringSlice path, + const ExternEngineInterfaceHandle *table_client); + +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +void drop_snapshot(const SnapshotHandle *snapshot); + +/// Get the version of the specified snapshot +/// +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +uint64_t version(const SnapshotHandle *snapshot); + +/// # Safety +/// +/// Caller is responsible to pass a valid handle. +uintptr_t visit_schema(const SnapshotHandle *snapshot, EngineSchemaVisitor *visitor); + +uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); + +uintptr_t visit_expression_lt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_le(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_gt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_ge(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +uintptr_t visit_expression_eq(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); + +/// # Safety +/// The string slice must be valid +ExternResult visit_expression_column(KernelExpressionVisitorState *state, + KernelStringSlice name, + AllocateErrorFn allocate_error); + +/// # Safety +/// The string slice must be valid +ExternResult visit_expression_literal_string(KernelExpressionVisitorState *state, + KernelStringSlice value, + AllocateErrorFn allocate_error); + +uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int64_t value); + +/// Get a FileList for all the files that need to be read from the table. +/// # Safety +/// +/// Caller is responsible to pass a valid snapshot pointer. +ExternResult kernel_scan_files_init(const SnapshotHandle *snapshot, + const ExternEngineInterfaceHandle *table_client, + EnginePredicate *predicate); + +/// # Safety +/// +/// The iterator must be valid (returned by [kernel_scan_files_init]) and not yet freed by +/// [kernel_scan_files_free]. The visitor function pointer must be non-null. +ExternResult kernel_scan_files_next(KernelScanFileIterator *files, + void *engine_context, + void (*engine_visitor)(void *engine_context, + KernelStringSlice file_name)); + +/// # Safety +/// +/// Caller is responsible to (at most once) pass a valid pointer returned by a call to +/// [kernel_scan_files_init]. +void kernel_scan_files_free(KernelScanFileIterator *files); + +} // extern "C" From 447fdb462be66e86d95fc2bc98da35aaba1f1abc Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 14:42:33 -0700 Subject: [PATCH 18/27] add gitignore headers --- ffi/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 ffi/.gitignore diff --git a/ffi/.gitignore b/ffi/.gitignore new file mode 100644 index 000000000..c699c2295 --- /dev/null +++ b/ffi/.gitignore @@ -0,0 +1,2 @@ +kernel.h +kernel.hpp From 3ce3b153328b309aaf873dba061989ce38081992 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 14:45:41 -0700 Subject: [PATCH 19/27] try_from_slice just unwraps --- ffi/src/lib.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index e6229a7fd..d9657f3e5 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -94,7 +94,7 @@ impl Into for &str { } trait TryFromStringSlice: Sized { - unsafe fn try_from_slice(slice: KernelStringSlice) -> DeltaResult; + unsafe fn try_from_slice(slice: KernelStringSlice) -> Self; } impl TryFromStringSlice for String { @@ -104,12 +104,9 @@ impl TryFromStringSlice for String { /// /// The slice must be a valid (non-null) pointer, and must point to the indicated number of /// valid utf8 bytes. - unsafe fn try_from_slice(slice: KernelStringSlice) -> DeltaResult { + unsafe fn try_from_slice(slice: KernelStringSlice) -> String { let slice = unsafe { std::slice::from_raw_parts(slice.ptr.cast(), slice.len) }; - match std::str::from_utf8(slice) { - Ok(s) => Ok(s.to_string()), - Err(e) => Err(Error::generic_err(e)), - } + std::str::from_utf8(slice).unwrap().to_string() } } @@ -339,7 +336,7 @@ impl ExternEngineInterface for ExternEngineInterfaceVtable { /// Caller is responsible to pass a valid path pointer. unsafe fn unwrap_and_parse_path_as_url(path: KernelStringSlice) -> DeltaResult { let path = unsafe { String::try_from_slice(path) }; - let path = std::fs::canonicalize(PathBuf::from(path?)).map_err(Error::generic)?; + let path = std::fs::canonicalize(PathBuf::from(path)).map_err(Error::generic)?; Url::from_directory_path(path).map_err(|_| Error::generic("Invalid url")) } @@ -677,7 +674,7 @@ unsafe fn visit_expression_column_impl( name: KernelStringSlice, ) -> DeltaResult { let name = unsafe { String::try_from_slice(name) }; - Ok(wrap_expression(state, Expression::Column(name?))) + Ok(wrap_expression(state, Expression::Column(name))) } /// # Safety @@ -697,7 +694,7 @@ unsafe fn visit_expression_literal_string_impl( let value = unsafe { String::try_from_slice(value) }; Ok(wrap_expression( state, - Expression::Literal(Scalar::from(value?)), + Expression::Literal(Scalar::from(value)), )) } From a7b50dd30d52e39215c7c9b47becf6bc69ea17a6 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 14:48:52 -0700 Subject: [PATCH 20/27] remove accidental kernel.hpp commit --- ffi/kernel.hpp | 225 ------------------------------------------------- 1 file changed, 225 deletions(-) delete mode 100644 ffi/kernel.hpp diff --git a/ffi/kernel.hpp b/ffi/kernel.hpp deleted file mode 100644 index b10a938fe..000000000 --- a/ffi/kernel.hpp +++ /dev/null @@ -1,225 +0,0 @@ -#include -#include -#include -#include -#include - -enum class KernelError { - UnknownError, - FFIError, - ArrowError, - EngineDataTypeError, - ExtractError, - GenericError, - IOErrorError, - ParquetError, -#if defined(DEFINE_DEFAULT_CLIENT) - ObjectStoreError, -#endif -#if defined(DEFINE_DEFAULT_CLIENT) - ObjectStorePathError, -#endif - FileNotFoundError, - MissingColumnError, - UnexpectedColumnTypeError, - MissingDataError, - MissingVersionError, - DeletionVectorError, - InvalidUrlError, - MalformedJsonError, - MissingMetadataError, - MissingProtocolError, - MissingMetadataAndProtocolError, - ParseError, - JoinFailureError, - Utf8Error, - ParseIntError, -}; - -struct ExternEngineInterfaceHandle; - -struct KernelExpressionVisitorState; - -struct KernelScanFileIterator; - -struct SnapshotHandle; - -/// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap -/// the engine functions. The engine retains ownership of the iterator. -struct EngineIterator { - void *data; - /// A function that should advance the iterator and return the next time from the data - /// If the iterator is complete, it should return null. It should be safe to - /// call `get_next()` multiple times if it is null. - const void *(*get_next)(void *data); -}; - -/// An error that can be returned to the engine. Engines that wish to associate additional -/// information can define and use any type that is [pointer -/// interconvertible](https://en.cppreference.com/w/cpp/language/static_cast#pointer-interconvertible) -/// with this one -- e.g. by subclassing this struct or by embedding this struct as the first member -/// of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) -/// class. -struct EngineError { - KernelError etype; -}; - -/// Semantics: Kernel will always immediately return the leaked engine error to the engine (if it -/// allocated one at all), and engine is responsible to free it. -template -struct ExternResult { - enum class Tag { - Ok, - Err, - }; - - struct Ok_Body { - T _0; - }; - - struct Err_Body { - EngineError *_0; - }; - - Tag tag; - union { - Ok_Body ok; - Err_Body err; - }; -}; - -/// A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The -/// slice is only valid until the function it was passed into returns, and should not be copied. -/// -/// # Safety -/// -/// Intentionally not Copy, Clone, Send, nor Sync. -/// -/// Whoever instantiates the struct must ensure it does not outlive the data it points to. The -/// compiler cannot help us here, because raw pointers don't have lifetimes. To reduce the risk of -/// accidental misuse, it is recommended to only instantiate this struct as a function arg, by -/// converting a `&str` value `Into`, so the borrowed reference protects the -/// function call (callee must not retain any references to the slice after the call returns): -/// -/// ``` -/// fn wants_slice(slice: KernelStringSlice) { ... } -/// let msg = String::from(...); -/// wants_slice(msg.as_ref().into()); -/// ``` -struct KernelStringSlice { - const char *ptr; - uintptr_t len; -}; - -using AllocateErrorFn = EngineError*(*)(KernelError etype, KernelStringSlice msg); - -struct EngineSchemaVisitor { - void *data; - uintptr_t (*make_field_list)(void *data, uintptr_t reserve); - void (*visit_struct)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uintptr_t child_list_id); - void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); -}; - -struct EnginePredicate { - void *predicate; - uintptr_t (*visitor)(void *predicate, KernelExpressionVisitorState *state); -}; - -extern "C" { - -/// test function to print for items. this assumes each item is an `int` -void iterate(EngineIterator *it); - -#if defined(DEFINE_DEFAULT_CLIENT) -/// # Safety -/// -/// Caller is responsible to pass a valid path pointer. -ExternResult get_default_client(KernelStringSlice path, - AllocateErrorFn allocate_error); -#endif - -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -void drop_table_client(const ExternEngineInterfaceHandle *table_client); - -/// Get the latest snapshot from the specified table -/// -/// # Safety -/// -/// Caller is responsible to pass valid handles and path pointer. -ExternResult snapshot(KernelStringSlice path, - const ExternEngineInterfaceHandle *table_client); - -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -void drop_snapshot(const SnapshotHandle *snapshot); - -/// Get the version of the specified snapshot -/// -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -uint64_t version(const SnapshotHandle *snapshot); - -/// # Safety -/// -/// Caller is responsible to pass a valid handle. -uintptr_t visit_schema(const SnapshotHandle *snapshot, EngineSchemaVisitor *visitor); - -uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); - -uintptr_t visit_expression_lt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_le(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_gt(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_ge(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -uintptr_t visit_expression_eq(KernelExpressionVisitorState *state, uintptr_t a, uintptr_t b); - -/// # Safety -/// The string slice must be valid -ExternResult visit_expression_column(KernelExpressionVisitorState *state, - KernelStringSlice name, - AllocateErrorFn allocate_error); - -/// # Safety -/// The string slice must be valid -ExternResult visit_expression_literal_string(KernelExpressionVisitorState *state, - KernelStringSlice value, - AllocateErrorFn allocate_error); - -uintptr_t visit_expression_literal_long(KernelExpressionVisitorState *state, int64_t value); - -/// Get a FileList for all the files that need to be read from the table. -/// # Safety -/// -/// Caller is responsible to pass a valid snapshot pointer. -ExternResult kernel_scan_files_init(const SnapshotHandle *snapshot, - const ExternEngineInterfaceHandle *table_client, - EnginePredicate *predicate); - -/// # Safety -/// -/// The iterator must be valid (returned by [kernel_scan_files_init]) and not yet freed by -/// [kernel_scan_files_free]. The visitor function pointer must be non-null. -ExternResult kernel_scan_files_next(KernelScanFileIterator *files, - void *engine_context, - void (*engine_visitor)(void *engine_context, - KernelStringSlice file_name)); - -/// # Safety -/// -/// Caller is responsible to (at most once) pass a valid pointer returned by a call to -/// [kernel_scan_files_init]. -void kernel_scan_files_free(KernelScanFileIterator *files); - -} // extern "C" From 245fdf3935df1522e856badaaa1e20bfca31df8a Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 15:03:29 -0700 Subject: [PATCH 21/27] remove comment --- ffi/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index d9657f3e5..cfbf48d16 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -298,7 +298,6 @@ impl ArcHandle for ExternEngineInterfaceHandle { type Target = dyn ExternEngineInterface; } -// TODO missing a drop method struct ExternEngineInterfaceVtable { // Actual table client instance to use client: Arc, From aa2a564c8f79e2daeb8d0bb5fdb914ac4c050a6d Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 15:29:35 -0700 Subject: [PATCH 22/27] fix Backtraced error --- ffi/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index cfbf48d16..bfd048b9d 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -172,6 +172,7 @@ impl From for KernelError { Error::JoinFailure(_) => KernelError::JoinFailureError, Error::Utf8Error(_) => KernelError::Utf8Error, Error::ParseIntError(_) => KernelError::ParseIntError, + Error::Backtraced { source, backtrace: _ } => Self::from(*source), } } } From f947f6a727b74c8b80d1b1273a291a69d762edee Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 15:31:05 -0700 Subject: [PATCH 23/27] fmt --- ffi/src/lib.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index bfd048b9d..e838fbb45 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -172,7 +172,10 @@ impl From for KernelError { Error::JoinFailure(_) => KernelError::JoinFailureError, Error::Utf8Error(_) => KernelError::Utf8Error, Error::ParseIntError(_) => KernelError::ParseIntError, - Error::Backtraced { source, backtrace: _ } => Self::from(*source), + Error::Backtraced { + source, + backtrace: _, + } => Self::from(*source), } } } From f27c0b5a86f6e1dfd7b54c104325b28b7213ae22 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 17:59:14 -0700 Subject: [PATCH 24/27] fix cffi-test --- README.md | 2 +- ffi/.gitignore | 4 +- ffi/Cargo.toml | 3 +- ffi/Makefile | 15 +++++--- ffi/README.md | 12 +++++- ffi/build.rs | 11 ++---- ffi/cbindgen.toml | 11 ------ ffi/cffi-test.c | 93 +++++++++++++++++++++++++---------------------- ffi/src/lib.rs | 2 +- justfile | 9 ++++- kernel/Cargo.toml | 3 ++ 11 files changed, 89 insertions(+), 76 deletions(-) delete mode 100644 ffi/cbindgen.toml diff --git a/README.md b/README.md index 7ef15d8cf..740dd5d79 100644 --- a/README.md +++ b/README.md @@ -79,4 +79,4 @@ Some design principles which should be considered: [rustup]: https://rustup.rs [architecture.md]: https://github.com/delta-incubator/delta-kernel-rs/tree/master/architecture.md [dat]: https://github.com/delta-incubator/dat -[just]: https://github.com/casey/just +[just]: https://github.com/casey/just \ No newline at end of file diff --git a/ffi/.gitignore b/ffi/.gitignore index c699c2295..b65905e63 100644 --- a/ffi/.gitignore +++ b/ffi/.gitignore @@ -1,2 +1,2 @@ -kernel.h -kernel.hpp +cffi-test +cffi-test.o diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index b9c547e43..d5935cc8e 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -15,11 +15,12 @@ crate-type = ["cdylib", "staticlib"] [dependencies] tracing = "0.1" url = "2" -deltakernel = { path = "../kernel", features = ["developer-visibility", "tokio"] } +deltakernel = { path = "../kernel", features = ["developer-visibility"] } [build-dependencies] cbindgen = "0.26.0" libc = "0.2.147" [features] +default = ["default-client"] default-client = ["deltakernel/default-client"] diff --git a/ffi/Makefile b/ffi/Makefile index c04848f20..9cf4fc245 100644 --- a/ffi/Makefile +++ b/ffi/Makefile @@ -1,13 +1,13 @@ BINARY=cffi-test SOURCES=cffi-test.c -INCPATHS=kernel.h +INCPATHS=../target/ffi-headers LIBPATHS=../target/debug -LDFLAGS=-ldeltakernel #`pkg-config --libs arrow-glib` -CFLAGS=-c -Wall #`pkg-config --cflags arrow-glib` +LDFLAGS=-ldeltakernel_ffi #`pkg-config --libs arrow-glib` +CFLAGS=-c -Wall -DDEFINE_DEFAULT_CLIENT #`pkg-config --cflags arrow-glib` CC=gcc OBJECTS=$(SOURCES:.c=.o) -#INCFLAGS=$(foreach TMP,$(INCPATHS),-I$(TMP)) +INCFLAGS=$(foreach TMP,$(INCPATHS),-I$(TMP)) LIBFLAGS=$(foreach TMP,$(LIBPATHS),-L$(TMP)) all: $(SOURCES) $(BINARY) @@ -17,11 +17,14 @@ $(BINARY): $(OBJECTS) .c.o: $(CC) $(INCFLAGS) $(CFLAGS) -fPIC $< -o $@ -run: +run: $(BINARY) LD_LIBRARY_PATH=$(LIBPATHS) ./$(BINARY) $(table) +test: $(BINARY) + LD_LIBRARY_PATH=$(LIBPATHS) ./$(BINARY) ../kernel/tests/data/basic_partitioned + distclean: clean rm -f $(BINARY) clean: - rm -f $(OBJECTS) + rm -f $(OBJECTS) $(BINARY) diff --git a/ffi/README.md b/ffi/README.md index 45ea0ee2d..19704a4d0 100644 --- a/ffi/README.md +++ b/ffi/README.md @@ -6,7 +6,15 @@ This crate provides a c foreign function internface (ffi) for delta-kernel-rs. You can build static and shared-libraries, as well as the include headers by simply running: ```sh -cargo build [--release] +cargo build [--release] [--features default-client] ``` -This will place libraries in the root `target` dir (`../target/[debug,release]` from the directory containing this README), and headers in `../target/ffi-headers`. In that directory there will be a `deltakernel-ffi.h` file, which is the C header, and a `deltakernel-ffi.hpp` which is the C++ header. +to build and run the C program which excercises FFI: + +```sh +table=../kernel/tests/data/table-without-dv-small make run +``` + + + +This will place libraries in the root `target` dir (`../target/[debug,release]` from the directory containing this README), and headers in `../target/ffi-headers`. In that directory there will be a `deltakernel-ffi.h` file, which is the C header, and a `deltakernel-ffi.hpp` which is the C++ header. \ No newline at end of file diff --git a/ffi/build.rs b/ffi/build.rs index 0eeb7dddf..19ff278e0 100644 --- a/ffi/build.rs +++ b/ffi/build.rs @@ -23,13 +23,10 @@ fn main() { let target_dir = get_target_dir(crate_dir.as_str()); // Any `cfgs` we want to turn into ifdefs need to go in here - let defines: HashMap = HashMap::from([ - ( - "feature = default-client".into(), - "DEFINE_DEFAULT_CLIENT".into(), - ), - ("feature = tokio".into(), "DEFINE_TOKIO".into()), - ]); + let defines: HashMap = HashMap::from([( + "feature = default-client".into(), + "DEFINE_DEFAULT_CLIENT".into(), + )]); // generate cxx bindings let output_file_hpp = target_dir diff --git a/ffi/cbindgen.toml b/ffi/cbindgen.toml deleted file mode 100644 index d467b2685..000000000 --- a/ffi/cbindgen.toml +++ /dev/null @@ -1,11 +0,0 @@ -language = "C++" - -# A list of substitutions for converting cfg's to ifdefs. cfgs which aren't -# defined here will just be discarded. -# -# e.g. -# `#[cfg(target = "freebsd")] ...` -# becomes -# `#if defined(DEFINE_FREEBSD) ... #endif` -[defines] -"feature = default-client" = "DEFINE_DEFAULT_CLIENT" diff --git a/ffi/cffi-test.c b/ffi/cffi-test.c index c9460f5f9..edc9ead8d 100644 --- a/ffi/cffi-test.c +++ b/ffi/cffi-test.c @@ -1,62 +1,67 @@ #include +#include -#include "kernel.h" +#include "deltakernel-ffi.h" -/* #include */ -/* #include */ -/* #include "arrow/c/abi.h" */ +void visit_file(void *engine_context, struct KernelStringSlice file_name) { + printf("file: %s\n", file_name.ptr); +} -typedef struct iter_data { - int lim; - int cur; -} iter_data; +int main(int argc, char* argv[]) { -const void* next(void* data) { - iter_data *id = (iter_data*)data; - if (id->cur >= id->lim) { - return 0; - } else { - id->cur++; - return &id->cur; + if (argc < 2) { + printf("Usage: %s table/path\n", argv[0]); + return -1; } -} -void release(void* data) { - printf("released\n"); -} + char* table_path = argv[1]; + printf("Reading table at %s\n", table_path); -void test_iter() { - iter_data it; - it.lim = 10; - it.cur = 0; + KernelStringSlice table_path_slice = {table_path, strlen(table_path)}; - struct EngineIterator eit = { - .data = &it, - .get_next = &next, - }; - iterate(&eit); -} + ExternResult______ExternEngineInterfaceHandle table_client_res = + get_default_client(table_path_slice, NULL); + if (table_client_res.tag != Ok______ExternEngineInterfaceHandle) { + printf("Failed to get client\n"); + return -1; + } -int main(int argc, char* argv[]) { + const ExternEngineInterfaceHandle *table_client = table_client_res.ok; - if (argc < 2) { - printf("Usage: %s [table_path]\n", argv[0]); + ExternResult______SnapshotHandle snapshot_handle_res = snapshot(table_path_slice, table_client); + if (snapshot_handle_res.tag != Ok______SnapshotHandle) { + printf("Failed to create snapshot\n"); return -1; } - char* table_path = argv[1]; - printf("Opening table at %s\n", table_path); - DefaultTable *table = get_table_with_default_client(table_path); - DefaultSnapshot *ss = snapshot(table); - uint64_t v = version(ss); - printf("Got version: %lu\n", v); - - struct FileList fl = get_scan_files(ss, NULL); - printf("Need to read %i files\n", fl.file_count); - for (int i = 0;i < fl.file_count;i++) { - printf("file %i -> %s\n", i, fl.files[i]); + const SnapshotHandle *snapshot_handle = snapshot_handle_res.ok; + + uint64_t v = version(snapshot_handle); + printf("version: %llu\n", v); + + ExternResult_____KernelScanFileIterator file_iter_res = + kernel_scan_files_init(snapshot_handle, table_client, NULL); + if (file_iter_res.tag != Ok_____KernelScanFileIterator) { + printf("Failed to construct scan file iterator\n"); + return -1; } - test_iter(); + KernelScanFileIterator *file_iter = file_iter_res.ok; + + // iterate scan files + for (;;) { + ExternResult_bool ok_res = kernel_scan_files_next(file_iter, NULL, visit_file); + if (ok_res.tag != Ok_bool) { + printf("Failed to iterate scan file\n"); + return -1; + } else if (!ok_res.ok) { + break; + } + } + + kernel_scan_files_free(file_iter); + drop_snapshot(snapshot_handle); + drop_table_client(table_client); + return 0; } diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index e838fbb45..07ccccf82 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -794,7 +794,7 @@ fn kernel_scan_files_next_impl( engine_visitor: extern "C" fn(engine_context: *mut c_void, file_name: KernelStringSlice), ) -> DeltaResult { if let Some(add) = files.files.next().transpose()? { - println!("Got file: {}", add.path); + debug!("Got file: {}", add.path); (engine_visitor)(engine_context, add.path.as_str().into()); Ok(true) } else { diff --git a/justfile b/justfile index b307cc3b1..188430906 100644 --- a/justfile +++ b/justfile @@ -15,4 +15,11 @@ fix: # build and serve the documentation docs: - cardo docs --open + cargo docs --open + +# build and test ffi +ffi: + pushd ffi + cargo b --features default-client + table=../kernel/tests/data/table-without-dv-small make run + popd diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index 60ec550eb..9339fe242 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -8,6 +8,9 @@ repository.workspace = true readme.workspace = true version.workspace = true +# [lib] +# crate-type = ["staticlib"] + [dependencies] bytes = "1.4" chrono = { version = "0.4" } From 7630cb238ae3e50bb5662e6486732975469ce4c5 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 17:59:57 -0700 Subject: [PATCH 25/27] fix --- kernel/Cargo.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index 9339fe242..60ec550eb 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -8,9 +8,6 @@ repository.workspace = true readme.workspace = true version.workspace = true -# [lib] -# crate-type = ["staticlib"] - [dependencies] bytes = "1.4" chrono = { version = "0.4" } From 57dfd28dc7dc7ae99119f7bf4591d877a06282b5 Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Wed, 3 Apr 2024 18:03:49 -0700 Subject: [PATCH 26/27] try github actions? --- .github/workflows/build.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d8d1516ef..9be8ce7ae 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -60,3 +60,16 @@ jobs: - uses: Swatinem/rust-cache@v2 - name: test run: cargo test --verbose --all-features + ffi_test: + runs-on: macOS-latest + steps: + - uses: actions/checkout@v3 + - name: Install minimal stable with clippy and rustfmt + uses: actions-rs/toolchain@v1 + with: + profile: default + toolchain: stable + override: true + - uses: Swatinem/rust-cache@v2 + - name: test + run: pushd ffi && cargo b --features default-client && make test && popd From 78e4082dc167e13d297ec344c8341ab464e17e7a Mon Sep 17 00:00:00 2001 From: Zach Schuermann Date: Thu, 4 Apr 2024 11:46:34 -0700 Subject: [PATCH 27/27] update comment Co-authored-by: Ryan Johnson --- ffi/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 07ccccf82..89216cbbb 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -717,8 +717,8 @@ pub struct KernelScanFileIterator { files: Box>>, // Also keep a reference to the external client for its error allocator. - // Parquet and Json handlers don't hold any reference to the tokio reactor, so if the last table - // client goes out of scope the iterator terminates early. + // Parquet and Json handlers don't hold any reference to the tokio reactor, so the iterator + // terminates early if the last table client goes out of scope. table_client: Arc, }