Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support for FFI of dictionary-encoded arrays (#267)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Aug 11, 2021
1 parent 01cba03 commit d988539
Show file tree
Hide file tree
Showing 14 changed files with 485 additions and 342 deletions.
16 changes: 15 additions & 1 deletion arrow-pyarrow-integration-testing/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_decimal_roundtrip(self):
data = [
round(decimal.Decimal(722.82), 2),
round(decimal.Decimal(-934.11), 2),
None
None,
]
a = pyarrow.array(data, pyarrow.decimal128(5, 2))
b = arrow_pyarrow_integration_testing.round_trip(a)
Expand Down Expand Up @@ -179,3 +179,17 @@ def test_list_list_array(self):
b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_dict(self):
"""
Python -> Rust -> Python
"""
a = pyarrow.array(
["a", "a", "b", None, "c"],
pyarrow.dictionary(pyarrow.int64(), pyarrow.utf8()),
)
b = arrow_pyarrow_integration_testing.round_trip(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type
2 changes: 1 addition & 1 deletion src/array/binary/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ unsafe impl<O: Offset> ToFfi for BinaryArray<O> {

unsafe impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryArray<O> {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let data_type = array.field()?.data_type().clone();
let expected = if O::is_large() {
DataType::LargeBinary
} else {
Expand Down
3 changes: 3 additions & 0 deletions src/array/boolean/ffi.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::{
array::{FromFfi, ToFfi},
datatypes::DataType,
ffi,
};

Expand All @@ -22,6 +23,8 @@ unsafe impl ToFfi for BooleanArray {

unsafe impl<A: ffi::ArrowArrayRef> FromFfi<A> for BooleanArray {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.field()?.data_type().clone();
assert_eq!(data_type, DataType::Boolean);
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
Expand Down
39 changes: 39 additions & 0 deletions src/array/dictionary/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
use crate::{
array::{FromFfi, PrimitiveArray, ToFfi},
error::Result,
ffi,
};

use super::{DictionaryArray, DictionaryKey};

unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
self.keys.buffers()
}

#[inline]
fn offset(&self) -> usize {
self.offset
}
}

unsafe impl<K: DictionaryKey, A: ffi::ArrowArrayRef> FromFfi<A> for DictionaryArray<K> {
fn try_from_ffi(array: A) -> Result<Self> {
// keys: similar to PrimitiveArray, but the datatype is the inner one
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
let mut values = unsafe { array.buffer::<K>(0) }?;

if offset > 0 {
values = values.slice(offset, length);
validity = validity.map(|x| x.slice(offset, length))
}
let keys = PrimitiveArray::<K>::from_data(K::DATA_TYPE, values, validity);
// values
let values = array.dictionary()?.unwrap();
let values = ffi::try_from(values)?.into();

Ok(DictionaryArray::<K>::from_data(keys, values))
}
}
14 changes: 2 additions & 12 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ use crate::{
types::{NativeType, NaturalDataType},
};

mod ffi;
mod iterator;
mod mutable;
pub use iterator::*;
pub use mutable::*;

use super::{ffi::ToFfi, new_empty_array, primitive::PrimitiveArray, Array};
use super::{new_empty_array, primitive::PrimitiveArray, Array};

/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.
pub trait DictionaryKey: NativeType + NaturalDataType + num::NumCast + num::FromPrimitive {}
Expand Down Expand Up @@ -143,14 +144,3 @@ where
write!(f, "}}")
}
}

unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
vec![self.keys.validity().as_ref().map(|x| x.as_ptr())]
}

#[inline]
fn offset(&self) -> usize {
self.offset
}
}
37 changes: 26 additions & 11 deletions src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,28 @@ pub unsafe trait FromFfi<T: ffi::ArrowArrayRef>: Sized {
macro_rules! ffi_dyn {
($array:expr, $ty:ty) => {{
let array = $array.as_any().downcast_ref::<$ty>().unwrap();
(array.buffers(), array.children())
(array.buffers(), array.children(), None)
}};
}

type BuffersChildren = (Vec<Option<std::ptr::NonNull<u8>>>, Vec<Arc<dyn Array>>);
macro_rules! ffi_dict_dyn {
($array:expr, $ty:ty) => {{
let array = $array.as_any().downcast_ref::<$ty>().unwrap();
(
array.buffers(),
array.children(),
Some(array.values().clone()),
)
}};
}

type BuffersChildren = (
Vec<Option<std::ptr::NonNull<u8>>>,
Vec<Arc<dyn Array>>,
Option<Arc<dyn Array>>,
);

pub fn buffers_children(array: &dyn Array) -> BuffersChildren {
pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren {
match array.data_type() {
DataType::Null => ffi_dyn!(array, NullArray),
DataType::Boolean => ffi_dyn!(array, BooleanArray),
Expand Down Expand Up @@ -72,14 +87,14 @@ pub fn buffers_children(array: &dyn Array) -> BuffersChildren {
DataType::Struct(_) => ffi_dyn!(array, StructArray),
DataType::Union(_) => unimplemented!(),
DataType::Dictionary(key_type, _) => match key_type.as_ref() {
DataType::Int8 => ffi_dyn!(array, DictionaryArray::<i8>),
DataType::Int16 => ffi_dyn!(array, DictionaryArray::<i16>),
DataType::Int32 => ffi_dyn!(array, DictionaryArray::<i32>),
DataType::Int64 => ffi_dyn!(array, DictionaryArray::<i64>),
DataType::UInt8 => ffi_dyn!(array, DictionaryArray::<u8>),
DataType::UInt16 => ffi_dyn!(array, DictionaryArray::<u16>),
DataType::UInt32 => ffi_dyn!(array, DictionaryArray::<u32>),
DataType::UInt64 => ffi_dyn!(array, DictionaryArray::<u64>),
DataType::Int8 => ffi_dict_dyn!(array, DictionaryArray::<i8>),
DataType::Int16 => ffi_dict_dyn!(array, DictionaryArray::<i16>),
DataType::Int32 => ffi_dict_dyn!(array, DictionaryArray::<i32>),
DataType::Int64 => ffi_dict_dyn!(array, DictionaryArray::<i64>),
DataType::UInt8 => ffi_dict_dyn!(array, DictionaryArray::<u8>),
DataType::UInt16 => ffi_dict_dyn!(array, DictionaryArray::<u16>),
DataType::UInt32 => ffi_dict_dyn!(array, DictionaryArray::<u32>),
DataType::UInt64 => ffi_dict_dyn!(array, DictionaryArray::<u64>),
_ => unreachable!(),
},
}
Expand Down
2 changes: 1 addition & 1 deletion src/array/list/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ unsafe impl<O: Offset> ToFfi for ListArray<O> {

unsafe impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for ListArray<O> {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let data_type = array.field()?.data_type().clone();
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
Expand Down
2 changes: 1 addition & 1 deletion src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ pub use specification::Offset;
pub use struct_::StructArray;
pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter};

pub(crate) use self::ffi::buffers_children;
pub(crate) use self::ffi::buffers_children_dictionary;
pub use self::ffi::FromFfi;
pub use self::ffi::ToFfi;

Expand Down
2 changes: 1 addition & 1 deletion src/array/primitive/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ unsafe impl<T: NativeType> ToFfi for PrimitiveArray<T> {

unsafe impl<T: NativeType, A: ffi::ArrowArrayRef> FromFfi<A> for PrimitiveArray<T> {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let data_type = array.field()?.data_type().clone();
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
Expand Down
8 changes: 4 additions & 4 deletions src/array/struct_.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ impl StructArray {
}

impl StructArray {
pub fn get_fields(datatype: &DataType) -> &[Field] {
if let DataType::Struct(fields) = datatype {
pub fn get_fields(data_type: &DataType) -> &[Field] {
if let DataType::Struct(fields) = data_type {
fields
} else {
panic!("Wrong datatype passed to Struct.")
Expand Down Expand Up @@ -139,8 +139,8 @@ unsafe impl ToFfi for StructArray {

unsafe impl<A: ffi::ArrowArrayRef> FromFfi<A> for StructArray {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let fields = Self::get_fields(&data_type).to_vec();
let field = array.field()?;
let fields = Self::get_fields(field.data_type()).to_vec();

let length = array.array().len();
let offset = array.array().offset();
Expand Down
30 changes: 26 additions & 4 deletions src/ffi/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::array::{BooleanArray, FromFfi};
use crate::error::{ArrowError, Result};
use crate::types::days_ms;
use crate::{
array::{Array, BinaryArray, ListArray, PrimitiveArray, StructArray, Utf8Array},
array::*,
datatypes::{DataType, IntervalUnit},
};

Expand All @@ -32,8 +32,8 @@ use crate::{
/// * the data type is not supported
/// * the interface is not valid (e.g. a null pointer)
pub fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> {
let data_type = array.data_type()?;
let array: Box<dyn Array> = match data_type {
let field = array.field()?;
let array: Box<dyn Array> = match field.data_type() {
DataType::Boolean => Box::new(BooleanArray::try_from_ffi(array)?),
DataType::Int8 => Box::new(PrimitiveArray::<i8>::try_from_ffi(array)?),
DataType::Int16 => Box::new(PrimitiveArray::<i16>::try_from_ffi(array)?),
Expand Down Expand Up @@ -66,6 +66,17 @@ pub fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> {
DataType::List(_) => Box::new(ListArray::<i32>::try_from_ffi(array)?),
DataType::LargeList(_) => Box::new(ListArray::<i64>::try_from_ffi(array)?),
DataType::Struct(_) => Box::new(StructArray::try_from_ffi(array)?),
DataType::Dictionary(keys, _) => match keys.as_ref() {
DataType::Int8 => Box::new(DictionaryArray::<i8>::try_from_ffi(array)?),
DataType::Int16 => Box::new(DictionaryArray::<i16>::try_from_ffi(array)?),
DataType::Int32 => Box::new(DictionaryArray::<i32>::try_from_ffi(array)?),
DataType::Int64 => Box::new(DictionaryArray::<i64>::try_from_ffi(array)?),
DataType::UInt8 => Box::new(DictionaryArray::<u8>::try_from_ffi(array)?),
DataType::UInt16 => Box::new(DictionaryArray::<u16>::try_from_ffi(array)?),
DataType::UInt32 => Box::new(DictionaryArray::<u32>::try_from_ffi(array)?),
DataType::UInt64 => Box::new(DictionaryArray::<u64>::try_from_ffi(array)?),
_ => unreachable!(),
},
data_type => {
return Err(ArrowError::NotYetImplemented(format!(
"Reading DataType \"{}\" is not yet supported.",
Expand All @@ -80,7 +91,6 @@ pub fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> {
#[cfg(test)]
mod tests {
use super::*;
use crate::array::*;
use crate::datatypes::TimeUnit;
use crate::{error::Result, ffi};
use std::sync::Arc;
Expand Down Expand Up @@ -199,4 +209,16 @@ mod tests {

test_round_trip(array)
}

#[test]
fn test_dict() -> Result<()> {
let data = vec![Some("a"), Some("a"), None, Some("b")];

let mut array = MutableDictionaryArray::<i32, MutableUtf8Array<i32>>::new();
array.try_extend(data)?;

let array: DictionaryArray<i32> = array.into();

test_round_trip(array)
}
}
Loading

0 comments on commit d988539

Please sign in to comment.