From 20a569a733f450e463eea2d635d052958a71f750 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 19 Jun 2024 18:43:05 -0700 Subject: [PATCH] fix: Adjust FFI_ArrowArray offset based on the offset of offset buffer (#5895) * fix: Adjust FFI_ArrowArray offset based on the offset of offset buffer * For review * Fix clippy * For review --- arrow-array/src/ffi.rs | 56 ++++++++++++++++++++++++++++ arrow-buffer/src/buffer/immutable.rs | 13 +++++++ arrow-data/src/ffi.rs | 55 +++++++++++++++++++++++++-- 3 files changed, 120 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 088a0a6ab58a..fe755b91f765 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -1225,6 +1225,7 @@ mod tests_from_ffi { use std::sync::Arc; use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer}; + use arrow_data::transform::MutableArrayData; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; @@ -1234,6 +1235,7 @@ mod tests_from_ffi { Int32Array, Int64Array, StringArray, StructArray, UInt32Array, UInt64Array, }, ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}, + make_array, ArrayRef, }; use super::{ImportedArrowArray, Result}; @@ -1458,4 +1460,58 @@ mod tests_from_ffi { test_round_trip(&imported_array.consume()?) } + + fn roundtrip_string_array(array: StringArray) -> StringArray { + let data = array.into_data(); + + let array = FFI_ArrowArray::new(&data); + let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap(); + + let array = unsafe { from_ffi(array, &schema) }.unwrap(); + StringArray::from(array) + } + + fn extend_array(array: &dyn Array) -> ArrayRef { + let len = array.len(); + let data = array.to_data(); + + let mut mutable = MutableArrayData::new(vec![&data], false, len); + mutable.extend(0, 0, len); + make_array(mutable.freeze()) + } + + #[test] + fn test_extend_imported_string_slice() { + let mut strings = vec![]; + + for i in 0..1000 { + strings.push(format!("string: {}", i)); + } + + let string_array = StringArray::from(strings); + + let imported = roundtrip_string_array(string_array.clone()); + assert_eq!(imported.len(), 1000); + assert_eq!(imported.value(0), "string: 0"); + assert_eq!(imported.value(499), "string: 499"); + + let copied = extend_array(&imported); + assert_eq!( + copied.as_any().downcast_ref::().unwrap(), + &imported + ); + + let slice = string_array.slice(500, 500); + + let imported = roundtrip_string_array(slice); + assert_eq!(imported.len(), 500); + assert_eq!(imported.value(0), "string: 500"); + assert_eq!(imported.value(499), "string: 999"); + + let copied = extend_array(&imported); + assert_eq!( + copied.as_any().downcast_ref::().unwrap(), + &imported + ); + } } diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index f26cde05b7ab..2c743842fb05 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -71,6 +71,19 @@ impl Buffer { } } + /// Returns the offset, in bytes, of `Self::ptr` to `Self::data` + /// + /// self.ptr and self.data can be different after slicing or advancing the buffer. + pub fn ptr_offset(&self) -> usize { + // Safety: `ptr` is always in bounds of `data`. + unsafe { self.ptr.offset_from(self.data.ptr().as_ptr()) as usize } + } + + /// Returns the pointer to the start of the buffer without the offset. + pub fn data_ptr(&self) -> NonNull { + self.data.ptr() + } + /// Create a [`Buffer`] from the provided [`Vec`] without copying #[inline] pub fn from_vec(vec: Vec) -> Self { diff --git a/arrow-data/src/ffi.rs b/arrow-data/src/ffi.rs index 589f7dac6d19..7fe87d4ae107 100644 --- a/arrow-data/src/ffi.rs +++ b/arrow-data/src/ffi.rs @@ -131,6 +131,37 @@ impl FFI_ArrowArray { data.buffers().iter().map(|b| Some(b.clone())).collect() }; + // Handle buffer offset for offset buffer. + let offset_offset = match data.data_type() { + DataType::Utf8 | DataType::Binary => { + // Offset buffer is possible a slice of the buffer. + // If we use slice pointer as exported buffer pointer, it will cause + // the consumer to calculate incorrect length of data buffer (buffer 1). + // We need to get the offset of the offset buffer and fill it in + // the `FFI_ArrowArray` offset field. + Some(data.buffers()[0].ptr_offset() / std::mem::size_of::()) + } + DataType::LargeUtf8 | DataType::LargeBinary => { + // Offset buffer is possible a slice of the buffer. + // If we use slice pointer as exported buffer pointer, it will cause + // the consumer to calculate incorrect length of data buffer (buffer 1). + // We need to get the offset of the offset buffer and fill it in + // the `FFI_ArrowArray` offset field. + Some(data.buffers()[0].ptr_offset() / std::mem::size_of::()) + } + _ => None, + }; + + let offset = if let Some(offset) = offset_offset { + if data.offset() != 0 { + // TODO: Adjust for data offset + panic!("The ArrayData of a slice offset buffer should not have offset"); + } + offset + } else { + data.offset() + }; + // `n_buffers` is the number of buffers by the spec. let n_buffers = { data_layout.buffers.len() + { @@ -143,9 +174,25 @@ impl FFI_ArrowArray { let buffers_ptr = buffers .iter() - .flat_map(|maybe_buffer| match maybe_buffer { - // note that `raw_data` takes into account the buffer's offset - Some(b) => Some(b.as_ptr() as *const c_void), + .enumerate() + .flat_map(|(buffer_idx, maybe_buffer)| match maybe_buffer { + Some(b) => { + match (data.data_type(), buffer_idx) { + ( + DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Binary + | DataType::LargeBinary, + 1, + ) => { + // For offset buffer, take original pointer without offset. + // Buffer offset should be handled by `FFI_ArrowArray` offset field. + Some(b.data_ptr().as_ptr() as *const c_void) + } + // For other buffers, note that `raw_data` takes into account the buffer's offset + _ => Some(b.as_ptr() as *const c_void), + } + } // This is for null buffer. We only put a null pointer for // null buffer if by spec it can contain null mask. None if data_layout.can_contain_null_mask => Some(std::ptr::null()), @@ -186,7 +233,7 @@ impl FFI_ArrowArray { Self { length: data.len() as i64, null_count: null_count as i64, - offset: data.offset() as i64, + offset: offset as i64, n_buffers, n_children, buffers: private_data.buffers_ptr.as_mut_ptr(),