From 074bcb5793e466c54ef7d81e9675a69aa16d2f6c Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 15 Jul 2024 07:11:36 -0400 Subject: [PATCH] Directly decode String/BinaryView types from arrow-row format (#6044) * add string view bench * check in new impl * add utf8 * quick utf8 validation * Update arrow-row/src/variable.rs Co-authored-by: Andrew Lamb * address comments * update * Revert "address comments" This reverts commit e2656c94dd5ff4fb2f486278feb346d44a7f5436. * addr comments --------- Co-authored-by: Andrew Lamb --- arrow-row/src/variable.rs | 83 ++++++++++++++++++++++++++++++------- arrow/benches/row_format.rs | 14 ++++++- 2 files changed, 81 insertions(+), 16 deletions(-) diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index c5aa7d8ac32..4d4bcddc080 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -22,6 +22,7 @@ use arrow_buffer::bit_util::ceil; use arrow_buffer::MutableBuffer; use arrow_data::ArrayDataBuilder; use arrow_schema::{DataType, SortOptions}; +use builder::make_view; /// The block size of the variable length encoding pub const BLOCK_SIZE: usize = 32; @@ -152,6 +153,8 @@ fn encode_blocks(out: &mut [u8], val: &[u8]) -> usize { end_offset } +/// Decodes a single block of data +/// The `f` function accepts a slice of the decoded data, it may be called multiple times pub fn decode_blocks(row: &[u8], options: SortOptions, mut f: impl FnMut(&[u8])) -> usize { let (non_empty_sentinel, continuation) = match options.descending { true => (!NON_EMPTY_SENTINEL, !BLOCK_CONTINUATION), @@ -243,6 +246,69 @@ pub fn decode_binary( unsafe { GenericBinaryArray::from(builder.build_unchecked()) } } +fn decode_binary_view_inner( + rows: &mut [&[u8]], + options: SortOptions, + check_utf8: bool, +) -> BinaryViewArray { + let len = rows.len(); + + let mut null_count = 0; + + let nulls = MutableBuffer::collect_bool(len, |x| { + let valid = rows[x][0] != null_sentinel(options); + null_count += !valid as usize; + valid + }); + 
+ let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum(); + let mut values = MutableBuffer::new(values_capacity); + let mut views = BufferBuilder::<u128>::new(len); + + for row in rows { + let start_offset = values.len(); + let offset = decode_blocks(row, options, |b| values.extend_from_slice(b)); + if row[0] == null_sentinel(options) { + debug_assert_eq!(offset, 1); + debug_assert_eq!(start_offset, values.len()); + views.append(0); + } else { + // Safety: we just appended the data to the end of the buffer + let val = unsafe { values.get_unchecked_mut(start_offset..) }; + + if options.descending { + val.iter_mut().for_each(|o| *o = !*o); + } + + let view = make_view(val, 0, start_offset as u32); + views.append(view); + } + *row = &row[offset..]; + } + + if check_utf8 { + // the values contains all data, no matter if it is short or long + // we can validate utf8 in one go. + std::str::from_utf8(values.as_slice()).unwrap(); + } + + let builder = ArrayDataBuilder::new(DataType::BinaryView) + .len(len) + .null_count(null_count) + .null_bit_buffer(Some(nulls.into())) + .add_buffer(views.finish()) + .add_buffer(values.into()); + + // SAFETY: + // Valid by construction above + unsafe { BinaryViewArray::from(builder.build_unchecked()) } +} + +/// Decodes a binary view array from `rows` with the provided `options` +pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray { + decode_binary_view_inner(rows, options, false) +} + /// Decodes a string array from `rows` with the provided `options` /// /// # Safety @@ -269,16 +335,6 @@ pub unsafe fn decode_string( GenericStringArray::from(builder.build_unchecked()) } -/// Decodes a binary view array from `rows` with the provided `options` -pub fn decode_binary_view(rows: &mut [&[u8]], options: SortOptions) -> BinaryViewArray { - let decoded: GenericBinaryArray = decode_binary(rows, options); - - // Better performance might be to directly build the binary view instead of building
to BinaryArray and then casting - // I suspect that the overhead is not a big deal. - // If it is, we can reimplement the `decode_binary_view` function to directly build the StringViewArray - BinaryViewArray::from(&decoded) -} - /// Decodes a string view array from `rows` with the provided `options` /// /// # Safety @@ -289,9 +345,6 @@ pub unsafe fn decode_string_view( options: SortOptions, validate_utf8: bool, ) -> StringViewArray { - let decoded: GenericStringArray = decode_string(rows, options, validate_utf8); - // Better performance might be to directly build the string view instead of building to StringArray and then casting - // I suspect that the overhead is not a big deal. - // If it is, we can reimplement the `decode_string_view` function to directly build the StringViewArray - StringViewArray::from(&decoded) + let view = decode_binary_view_inner(rows, options, validate_utf8); + view.to_string_view_unchecked() } diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index b5298cbe367..0fb63b5b324 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -24,7 +24,7 @@ use arrow::datatypes::{Int64Type, UInt64Type}; use arrow::row::{RowConverter, SortField}; use arrow::util::bench_util::{ create_boolean_array, create_dict_from_values, create_primitive_array, - create_string_array_with_len, create_string_dict_array, + create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len, }; use arrow_array::types::Int32Type; use arrow_array::Array; @@ -87,6 +87,18 @@ fn row_bench(c: &mut Criterion) { let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef]; do_bench(c, "4096 string(100, 0.5)", cols); + let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef]; + do_bench(c, "4096 string view(10, 0)", cols); + + let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef]; + do_bench(c, "4096 string view(30,
0)", cols); + + let cols = vec![Arc::new(create_string_view_array_with_len(40960, 0., 100, false)) as ArrayRef]; + do_bench(c, "40960 string view(100, 0)", cols); + + let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef]; + do_bench(c, "4096 string view(100, 0.5)", cols); + let cols = vec![Arc::new(create_string_dict_array::<Int32Type>(4096, 0., 10)) as ArrayRef]; do_bench(c, "4096 string_dictionary(10, 0)", cols);