From d9f574e04fa81b4995068ef4230173ccee2aa75f Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Mon, 17 Jan 2022 23:32:56 +0800 Subject: [PATCH] add from_iter_values for binary array (#1188) --- arrow/src/array/array_binary.rs | 60 +++++++++++++++++++++++---------- arrow/src/array/array_string.rs | 7 ++-- 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index f5b25acb1753..46c7066585e3 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -124,22 +124,10 @@ impl GenericBinaryArray { } /// Creates a [GenericBinaryArray] from a vector of byte slices + /// + /// See also [`Self::from_iter_values`] pub fn from_vec(v: Vec<&[u8]>) -> Self { - let mut offsets = Vec::with_capacity(v.len() + 1); - let mut values = Vec::new(); - let mut length_so_far: OffsetSize = OffsetSize::zero(); - offsets.push(length_so_far); - for s in &v { - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s); - } - let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(v.len()) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)); - let array_data = unsafe { array_data.build_unchecked() }; - GenericBinaryArray::::from(array_data) + Self::from_iter_values(v) } /// Creates a [GenericBinaryArray] from a vector of Optional (null) byte slices @@ -171,6 +159,42 @@ impl GenericBinaryArray { let data = unsafe { builder.build_unchecked() }; Self::from(data) } + + /// Creates a `GenericBinaryArray` based on an iterator of values without nulls + pub fn from_iter_values(iter: I) -> Self + where + Ptr: AsRef<[u8]>, + I: IntoIterator, + { + let iter = iter.into_iter(); + let (_, data_len) = iter.size_hint(); + let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. + + let mut offsets = + MutableBuffer::new((data_len + 1) * std::mem::size_of::()); + let mut values = MutableBuffer::new(0); + + let mut length_so_far = OffsetSize::zero(); + offsets.push(length_so_far); + + for s in iter { + let s = s.as_ref(); + length_so_far += OffsetSize::from_usize(s.len()).unwrap(); + offsets.push(length_so_far); + values.extend_from_slice(s); + } + + // iterator size hint may not be correct so compute the actual number of offsets + assert!(!offsets.is_empty()); // wrote at least one + let actual_len = (offsets.len() / std::mem::size_of::()) - 1; + + let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) + .len(actual_len) + .add_buffer(offsets.into()) + .add_buffer(values.into()); + let array_data = unsafe { array_data.build_unchecked() }; + Self::from(array_data) + } } impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryArray { @@ -359,7 +383,7 @@ impl From>> for GenericBinaryArray { fn from(v: Vec>) -> Self { - GenericBinaryArray::::from_opt_vec(v) + Self::from_opt_vec(v) } } @@ -367,13 +391,13 @@ impl From> for GenericBinaryArray { fn from(v: Vec<&[u8]>) -> Self { - GenericBinaryArray::::from_vec(v) + Self::from_iter_values(v) } } impl From> for GenericBinaryArray { fn from(v: GenericListArray) -> Self { - GenericBinaryArray::::from_list(v) + Self::from_list(v) } } diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 6af8bae073f9..1477eaa8ce90 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -138,9 +138,10 @@ impl GenericStringArray { } /// Creates a `GenericStringArray` based on an iterator of values without nulls - pub fn from_iter_values>(iter: I) -> Self + pub fn from_iter_values(iter: I) -> Self where Ptr: AsRef, + I: IntoIterator, { let iter = iter.into_iter(); let (_, data_len) = iter.size_hint(); @@ -306,7 +307,7 @@ impl From> for GenericStringArray { fn from(v: Vec<&str>) -> Self { - GenericStringArray::::from_iter_values(v) + Self::from_iter_values(v) } } @@ -314,7 +315,7 @@ impl From> for GenericStringArray { fn from(v: Vec) -> Self { - GenericStringArray::::from_iter_values(v) + Self::from_iter_values(v) } }