Skip to content

Commit

Permalink
Let the StringBuilder use BinaryBuilder (#2181)
Browse files Browse the repository at this point in the history
* update string builder

Signed-off-by: remzi <[email protected]>

* update tests and fmt

Signed-off-by: remzi <[email protected]>
  • Loading branch information
HaoYang670 authored Jul 28, 2022
1 parent 82e0512 commit 9e47779
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 91 deletions.
31 changes: 23 additions & 8 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ use std::fmt;
use std::{any::Any, iter::FromIterator};

use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray,
GenericStringIter, OffsetSizeTrait,
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData,
GenericBinaryArray, GenericListArray, GenericStringIter, OffsetSizeTrait,
};
use crate::array::array::ArrayAccessor;
use crate::buffer::Buffer;
Expand Down Expand Up @@ -313,6 +313,27 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor
}
}

impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
for GenericStringArray<OffsetSize>
{
fn from(v: GenericListArray<OffsetSize>) -> Self {
GenericStringArray::<OffsetSize>::from_list(v)
}
}

impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
for GenericStringArray<OffsetSize>
{
fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
let builder = v
.into_data()
.into_builder()
.data_type(Self::get_data_type());
let data = unsafe { builder.build_unchecked() };
Self::from(data)
}
}

impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericStringArray<OffsetSize> {
fn from(data: ArrayData) -> Self {
assert_eq!(
Expand Down Expand Up @@ -385,12 +406,6 @@ pub type StringArray = GenericStringArray<i32>;
/// ```
pub type LargeStringArray = GenericStringArray<i64>;

impl<T: OffsetSizeTrait> From<GenericListArray<T>> for GenericStringArray<T> {
fn from(v: GenericListArray<T>) -> Self {
GenericStringArray::<T>::from_list(v)
}
}

#[cfg(test)]
mod tests {

Expand Down
23 changes: 23 additions & 0 deletions arrow/src/array/builder/generic_binary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,19 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
}
}

/// Creates a new [`GenericBinaryBuilder`],
/// `item_capacity` is the number of items to pre-allocate space for in this builder
/// `data_capacity` is the number of bytes to pre-allocate space for in this builder
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
let mut offsets_builder = BufferBuilder::<OffsetSize>::new(item_capacity + 1);
offsets_builder.append(OffsetSize::zero());
Self {
value_builder: UInt8BufferBuilder::new(data_capacity),
offsets_builder,
null_buffer_builder: NullBufferBuilder::new(item_capacity),
}
}

/// Appends a byte slice into the builder.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
Expand Down Expand Up @@ -82,6 +95,16 @@ impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
let array_data = unsafe { array_builder.build_unchecked() };
GenericBinaryArray::<OffsetSize>::from(array_data)
}

/// Returns the current values buffer as a slice
pub fn values_slice(&self) -> &[u8] {
self.value_builder.as_slice()
}

/// Returns the current offsets buffer as a slice
pub fn offsets_slice(&self) -> &[OffsetSize] {
self.offsets_builder.as_slice()
}
}

impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericBinaryBuilder<OffsetSize> {
Expand Down
145 changes: 62 additions & 83 deletions arrow/src/array/builder/generic_string_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,60 +15,46 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::{
ArrayBuilder, ArrayRef, GenericListBuilder, GenericStringArray, OffsetSizeTrait,
UInt8Builder,
};
use crate::array::{ArrayBuilder, ArrayRef, GenericStringArray, OffsetSizeTrait};
use std::any::Any;
use std::sync::Arc;

use super::GenericBinaryBuilder;

/// Array builder for [`GenericStringArray`]
#[derive(Debug)]
pub struct GenericStringBuilder<OffsetSize: OffsetSizeTrait> {
builder: GenericListBuilder<OffsetSize, UInt8Builder>,
builder: GenericBinaryBuilder<OffsetSize>,
}

impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
/// Creates a new `StringBuilder`,
/// Creates a new [`GenericStringBuilder`],
/// `capacity` is the number of bytes of string data to pre-allocate space for in this builder
pub fn new(capacity: usize) -> Self {
let values_builder = UInt8Builder::new(capacity);
Self {
builder: GenericListBuilder::new(values_builder),
builder: GenericBinaryBuilder::new(capacity),
}
}

/// Creates a new `StringBuilder`,
/// Creates a new [`GenericStringBuilder`],
/// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder
/// `item_capacity` is the number of items to pre-allocate space for in this builder
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
let values_builder = UInt8Builder::new(data_capacity);
Self {
builder: GenericListBuilder::with_capacity(values_builder, item_capacity),
builder: GenericBinaryBuilder::with_capacity(item_capacity, data_capacity),
}
}

/// Appends a string into the builder.
///
/// Automatically calls the `append` method to delimit the string appended in as a
/// distinct array element.
#[inline]
pub fn append_value(&mut self, value: impl AsRef<str>) {
self.builder
.values()
.append_slice(value.as_ref().as_bytes());
self.builder.append(true);
}

/// Finish the current variable-length list array slot.
#[inline]
pub fn append(&mut self, is_valid: bool) {
self.builder.append(is_valid)
self.builder.append_value(value.as_ref().as_bytes());
}

/// Append a null value to the array.
#[inline]
pub fn append_null(&mut self) {
self.append(false)
self.builder.append_null()
}

/// Append an `Option` value to the array.
Expand All @@ -80,14 +66,14 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringBuilder<OffsetSize> {
};
}

/// Builds the `StringArray` and reset this builder.
/// Builds the [`GenericStringArray`] and reset this builder.
pub fn finish(&mut self) -> GenericStringArray<OffsetSize> {
GenericStringArray::<OffsetSize>::from(self.builder.finish())
}

/// Returns the current values buffer as a slice
pub fn values_slice(&self) -> &[u8] {
self.builder.values_ref().values_slice()
self.builder.values_slice()
}

/// Returns the current offsets buffer as a slice
Expand Down Expand Up @@ -131,79 +117,72 @@ impl<OffsetSize: OffsetSizeTrait> ArrayBuilder for GenericStringBuilder<OffsetSi

#[cfg(test)]
mod tests {
use crate::array::builder::StringBuilder;
use crate::array::{Array, ArrayBuilder};
use super::*;
use crate::array::{Array, ArrayBuilder, OffsetSizeTrait};

#[test]
fn test_string_array_builder() {
let mut builder = StringBuilder::new(20);
fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
let mut builder = GenericStringBuilder::<O>::new(20);
let owned = "arrow".to_owned();

builder.append_value("hello");
builder.append(true);
builder.append_value("world");

let string_array = builder.finish();
builder.append_value("");
builder.append_value(&owned);
builder.append_null();
builder.append_option(Some("rust"));
builder.append_option(None::<&str>);
builder.append_option(None::<String>);
assert_eq!(7, builder.len());

assert_eq!(
GenericStringArray::<O>::from(vec![
Some("hello"),
Some(""),
Some("arrow"),
None,
Some("rust"),
None,
None
]),
builder.finish()
);
}

assert_eq!(3, string_array.len());
assert_eq!(0, string_array.null_count());
assert_eq!("hello", string_array.value(0));
assert_eq!("", string_array.value(1));
assert_eq!("world", string_array.value(2));
assert_eq!(5, string_array.value_offsets()[2]);
assert_eq!(5, string_array.value_length(2));
#[test]
fn test_string_array_builder() {
_test_generic_string_array_builder::<i32>()
}

#[test]
fn test_string_array_builder_finish() {
let mut builder = StringBuilder::new(10);
fn test_large_string_array_builder() {
_test_generic_string_array_builder::<i64>()
}

fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);

builder.append_value("hello");
builder.append_value("world");
builder.append_value("rust");
builder.append_null();

let mut arr = builder.finish();
assert_eq!(2, arr.len());
assert_eq!(0, builder.len());
builder.finish();
assert!(builder.is_empty());
assert_eq!(&[O::zero()], builder.offsets_slice());

builder.append_value("arrow");
arr = builder.finish();
assert_eq!(1, arr.len());
assert_eq!(0, builder.len());
builder.append_value("parquet");
let arr = builder.finish();
// array should not have null buffer because there is not `null` value.
assert_eq!(None, arr.data().null_buffer());
assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
}

#[test]
fn test_string_array_builder_append_string() {
let mut builder = StringBuilder::new(20);

let var = "hello".to_owned();
builder.append_value(&var);
builder.append(true);
builder.append_value("world");

let string_array = builder.finish();

assert_eq!(3, string_array.len());
assert_eq!(0, string_array.null_count());
assert_eq!("hello", string_array.value(0));
assert_eq!("", string_array.value(1));
assert_eq!("world", string_array.value(2));
assert_eq!(5, string_array.value_offsets()[2]);
assert_eq!(5, string_array.value_length(2));
fn test_string_array_builder_finish() {
_test_generic_string_array_builder_finish::<i32>()
}

#[test]
fn test_string_array_builder_append_option() {
let mut builder = StringBuilder::new(20);
builder.append_option(Some("hello"));
builder.append_option(None::<&str>);
builder.append_option(None::<String>);
builder.append_option(Some("world"));

let string_array = builder.finish();

assert_eq!(4, string_array.len());
assert_eq!("hello", string_array.value(0));
assert!(string_array.is_null(1));
assert!(string_array.is_null(2));
assert_eq!("world", string_array.value(3));
fn test_large_string_array_builder_finish() {
_test_generic_string_array_builder_finish::<i64>()
}
}

0 comments on commit 9e47779

Please sign in to comment.