Skip to content

Commit

Permalink
Directly decode String/BinaryView types from arrow-row format (#6044)
Browse files Browse the repository at this point in the history
* add string view bench

* check in new impl

* add utf8

* quick utf8 validation

* Update arrow-row/src/variable.rs

Co-authored-by: Andrew Lamb <[email protected]>

* address comments

* update

* Revert "address comments"

This reverts commit e2656c9.

* addr comments

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
XiangpengHao and alamb authored Jul 15, 2024
1 parent 0002b4d commit 074bcb5
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 16 deletions.
83 changes: 68 additions & 15 deletions arrow-row/src/variable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use arrow_buffer::bit_util::ceil;
use arrow_buffer::MutableBuffer;
use arrow_data::ArrayDataBuilder;
use arrow_schema::{DataType, SortOptions};
use builder::make_view;

/// The block size of the variable length encoding
pub const BLOCK_SIZE: usize = 32;
Expand Down Expand Up @@ -152,6 +153,8 @@ fn encode_blocks<const SIZE: usize>(out: &mut [u8], val: &[u8]) -> usize {
end_offset
}

/// Decodes a single block of data
/// The `f` function accepts a slice of the decoded data, it may be called multiple times
pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize {
let (non_empty_sentinel, continuation) = match options.descending {
true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION),
Expand Down Expand Up @@ -243,6 +246,69 @@ pub fn decode_binary<I: OffsetSizeTrait>(
unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
}

fn decode_binary_view_inner(
rows: &mut [&[u8]],
options: SortOptions,
check_utf8: bool,
) -> BinaryViewArray {
let len = rows.len();

let mut null_count = 0;

let nulls = MutableBuffer::collect_bool(len, |x| {
let valid = rows[x][0] != null_sentinel(options);
null_count += !valid as usize;
valid
});

let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum();
let mut values = MutableBuffer::new(values_capacity);
let mut views = BufferBuilder::<u128>::new(len);

for row in rows {
let start_offset = values.len();
let offset = decode_blocks(row, options, |b| values.extend_from_slice(b));
if row[0] == null_sentinel(options) {
debug_assert_eq!(offset, 1);
debug_assert_eq!(start_offset, values.len());
views.append(0);
} else {
// Safety: we just appended the data to the end of the buffer
let val = unsafe { values.get_unchecked_mut(start_offset..) };

if options.descending {
val.iter_mut().for_each(|o| *o = !*o);
}

let view = make_view(val, 0, start_offset as u32);
views.append(view);
}
*row = &row[offset..];
}

if check_utf8 {
// the values contains all data, no matter if it is short or long
// we can validate utf8 in one go.
std::str::from_utf8(values.as_slice()).unwrap();
}

let builder = ArrayDataBuilder::new(DataType::BinaryView)
.len(len)
.null_count(null_count)
.null_bit_buffer(Some(nulls.into()))
.add_buffer(views.finish())
.add_buffer(values.into());

// SAFETY:
// Valid by construction above
unsafe { BinaryViewArray::from(builder.build_unchecked()) }
}

/// Decodes a binary view array from `rows` with the provided `options`
pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
decode_binary_view_inner(rows, options, false)
}

/// Decodes a string array from `rows` with the provided `options`
///
/// # Safety
Expand All @@ -269,16 +335,6 @@ pub unsafe fn decode_string<I: OffsetSizeTrait>(
GenericStringArray::from(builder.build_unchecked())
}

/// Decodes a binary view array from `rows` with the provided `options`
pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray {
let decoded: GenericBinaryArray<i64> = decode_binary(rows, options);

// Better performance might be to directly build the binary view instead of building to BinaryArray and then casting
// I suspect that the overhead is not a big deal.
// If it is, we can reimplement the `decode_binary_view` function to directly build the StringViewArray
BinaryViewArray::from(&decoded)
}

/// Decodes a string view array from `rows` with the provided `options`
///
/// # Safety
Expand All @@ -289,9 +345,6 @@ pub unsafe fn decode_string_view(
options: SortOptions,
validate_utf8: bool,
) -> StringViewArray {
let decoded: GenericStringArray<i64> = decode_string(rows, options, validate_utf8);
// Better performance might be to directly build the string view instead of building to StringArray and then casting
// I suspect that the overhead is not a big deal.
// If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray
StringViewArray::from(&decoded)
let view = decode_binary_view_inner(rows, options, validate_utf8);
view.to_string_view_unchecked()
}
14 changes: 13 additions & 1 deletion arrow/benches/row_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use arrow::datatypes::{Int64Type, UInt64Type};
use arrow::row::{RowConverter, SortField};
use arrow::util::bench_util::{
create_boolean_array, create_dict_from_values, create_primitive_array,
create_string_array_with_len, create_string_dict_array,
create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len,
};
use arrow_array::types::Int32Type;
use arrow_array::Array;
Expand Down Expand Up @@ -87,6 +87,18 @@ fn row_bench(c: &mut Criterion) {
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
do_bench(c, "4096 string(100, 0.5)", cols);

let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
do_bench(c, "4096 string view(10, 0)", cols);

let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
do_bench(c, "4096 string view(30, 0)", cols);

let cols = vec![Arc::new(create_string_view_array_with_len(40960, 0., 100, false)) as ArrayRef];
do_bench(c, "40960 string view(100, 0)", cols);

let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
do_bench(c, "4096 string view(100, 0.5)", cols);

let cols = vec![Arc::new(create_string_dict_array::<Int32Type>(4096, 0., 10)) as ArrayRef];
do_bench(c, "4096 string_dictionary(10, 0)", cols);

Expand Down

0 comments on commit 074bcb5

Please sign in to comment.