From fb0c2d710881a81ec03b9e89e33df5e9c4aa4c62 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 4 Feb 2022 17:44:36 +0800 Subject: [PATCH 1/8] Row backed by raw bytes --- datafusion/src/lib.rs | 1 + datafusion/src/row/bitmap/fmt.rs | 132 ++++++++++++++++++++++ datafusion/src/row/bitmap/mod.rs | 135 ++++++++++++++++++++++ datafusion/src/row/mod.rs | 186 +++++++++++++++++++++++++++++++ 4 files changed, 454 insertions(+) create mode 100644 datafusion/src/row/bitmap/fmt.rs create mode 100644 datafusion/src/row/bitmap/mod.rs create mode 100644 datafusion/src/row/mod.rs diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 9442f7e5fe9f..1d5ec13d2035 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -223,6 +223,7 @@ pub use arrow; pub use parquet; pub(crate) mod field_util; +pub(crate) mod row; #[cfg(feature = "pyarrow")] mod pyarrow; diff --git a/datafusion/src/row/bitmap/fmt.rs b/datafusion/src/row/bitmap/fmt.rs new file mode 100644 index 000000000000..0dbc81ba1234 --- /dev/null +++ b/datafusion/src/row/bitmap/fmt.rs @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::fmt::Write; + +use super::is_set; + +/// Formats `bytes` taking into account an offset and length of the form +pub fn fmt( + bytes: &[u8], + offset: usize, + length: usize, + f: &mut std::fmt::Formatter<'_>, +) -> std::fmt::Result { + assert!(offset < 8); + + f.write_char('[')?; + let mut remaining = length; + if remaining == 0 { + f.write_char(']')?; + return Ok(()); + } + + let first = bytes[0]; + let bytes = &bytes[1..]; + let empty_before = 8usize.saturating_sub(remaining + offset); + f.write_str("0b")?; + for _ in 0..empty_before { + f.write_char('_')?; + } + let until = std::cmp::min(8, offset + remaining); + for i in offset..until { + if is_set(first, offset + until - 1 - i) { + f.write_char('1')?; + } else { + f.write_char('0')?; + } + } + for _ in 0..offset { + f.write_char('_')?; + } + remaining -= until - offset; + + if remaining == 0 { + f.write_char(']')?; + return Ok(()); + } + + let number_of_bytes = remaining / 8; + for byte in &bytes[..number_of_bytes] { + f.write_str(", ")?; + f.write_fmt(format_args!("{:#010b}", byte))?; + } + remaining -= number_of_bytes * 8; + if remaining == 0 { + f.write_char(']')?; + return Ok(()); + } + + let last = bytes[std::cmp::min((length + offset + 7) / 8, bytes.len() - 1)]; + let remaining = (length + offset) % 8; + f.write_str(", ")?; + f.write_str("0b")?; + for _ in 0..(8 - remaining) { + f.write_char('_')?; + } + for i in 0..remaining { + if is_set(last, remaining - 1 - i) { + f.write_char('1')?; + } else { + f.write_char('0')?; + } + } + f.write_char(']') +} + +#[cfg(test)] +mod tests { + use super::*; + + struct A<'a>(&'a [u8], usize, usize); + impl<'a> std::fmt::Debug for A<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fmt(self.0, self.1, self.2, f) + } + } + + #[test] + fn test_debug() -> std::fmt::Result { + assert_eq!(format!("{:?}", A(&[1], 0, 0)), "[]"); + assert_eq!(format!("{:?}", A(&[0b11000001], 0, 8)), "[0b11000001]"); + assert_eq!( + format!("{:?}", A(&[0b11000001, 1], 0, 9)), + "[0b11000001, 0b_______1]" + ); + assert_eq!(format!("{:?}", A(&[1], 0, 2)), "[0b______01]"); + assert_eq!(format!("{:?}", A(&[1], 1, 2)), "[0b_____00_]"); + assert_eq!(format!("{:?}", A(&[1], 2, 2)), "[0b____00__]"); + assert_eq!(format!("{:?}", A(&[1], 3, 2)), "[0b___00___]"); + assert_eq!(format!("{:?}", A(&[1], 4, 2)), "[0b__00____]"); + assert_eq!(format!("{:?}", A(&[1], 5, 2)), "[0b_00_____]"); + assert_eq!(format!("{:?}", A(&[1], 6, 2)), "[0b00______]"); + assert_eq!( + format!("{:?}", A(&[0b11000001, 1], 1, 9)), + "[0b1100000_, 0b______01]" + ); + // extra bytes are ignored + assert_eq!( + format!("{:?}", A(&[0b11000001, 1, 1, 1], 1, 9)), + "[0b1100000_, 0b______01]" + ); + assert_eq!( + format!("{:?}", A(&[0b11000001, 1, 1], 2, 16)), + "[0b110000__, 0b00000001, 0b______01]" + ); + Ok(()) + } +} diff --git a/datafusion/src/row/bitmap/mod.rs b/datafusion/src/row/bitmap/mod.rs new file mode 100644 index 000000000000..13c6e09aaaaa --- /dev/null +++ b/datafusion/src/row/bitmap/mod.rs @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! General utilities for null bit section handling +//! +//! Note: this is a tailored version based on [arrow2 bitmap utils](https://github.com/jorgecarleitao/arrow2/tree/main/src/bitmap/utils) + +mod fmt; + +const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; +const UNSET_BIT_MASK: [u8; 8] = [ + 255 - 1, + 255 - 2, + 255 - 4, + 255 - 8, + 255 - 16, + 255 - 32, + 255 - 64, + 255 - 128, +]; + +/// Returns whether bit at position `i` in `byte` is set or not +#[inline] +pub fn is_set(byte: u8, i: usize) -> bool { + (byte & BIT_MASK[i]) != 0 +} + +/// Sets bit at position `i` in `byte` +#[inline] +pub fn set(byte: u8, i: usize, value: bool) -> u8 { + if value { + byte | BIT_MASK[i] + } else { + byte & UNSET_BIT_MASK[i] + } +} + +/// Sets bit at position `i` in `data` +#[inline] +pub fn set_bit(data: &mut [u8], i: usize, value: bool) { + data[i / 8] = set(data[i / 8], i % 8, value); +} + +/// Returns whether bit at position `i` in `data` is set or not +#[inline] +pub fn get_bit(data: &[u8], i: usize) -> bool { + is_set(data[i / 8], i % 8) +} + +/// Returns whether bit at position `i` in `data` is set or not. +/// +/// # Safety +/// `i >= data.len() * 8` results in undefined behavior +#[inline] +pub unsafe fn get_bit_unchecked(data: &[u8], i: usize) -> bool { + (*data.as_ptr().add(i >> 3) & BIT_MASK[i & 7]) != 0 +} + +/// Returns the number of bytes required to hold `bits` bits. +#[inline] +pub fn bytes_for(bits: usize) -> usize { + bits.saturating_add(7) / 8 +} + +/// Returns the number of zero bits in the slice offsetted by `offset` and a length of `length`. +/// # Panics +/// This function panics iff `(offset + len).saturating_add(7) / 8 >= slice.len()` +/// because it corresponds to the situation where `len` is beyond bounds. +pub fn count_zeros(slice: &[u8], offset: usize, len: usize) -> usize { + if len == 0 { + return 0; + }; + + let mut slice = &slice[offset / 8..(offset + len).saturating_add(7) / 8]; + let offset = offset % 8; + + if (offset + len) / 8 == 0 { + // all within a single byte + let byte = (slice[0] >> offset) << (8 - len); + return len - byte.count_ones() as usize; + } + + // slice: [a1,a2,a3,a4], [a5,a6,a7,a8] + // offset: 3 + // len: 4 + // [__,__,__,a4], [a5,a6,a7,__] + let mut set_count = 0; + if offset != 0 { + // count all ignoring the first `offset` bits + // i.e. [__,__,__,a4] + set_count += (slice[0] >> offset).count_ones() as usize; + slice = &slice[1..]; + } + if (offset + len) % 8 != 0 { + let end_offset = (offset + len) % 8; // i.e. 3 + 4 = 7 + let last_index = slice.len() - 1; + // count all ignoring the last `offset` bits + // i.e. [a5,a6,a7,__] + set_count += (slice[last_index] << (8 - end_offset)).count_ones() as usize; + slice = &slice[..last_index]; + } + + // finally, count any and all bytes in the middle in groups of 8 + let mut chunks = slice.chunks_exact(8); + set_count += chunks + .by_ref() + .map(|chunk| { + let a = u64::from_ne_bytes(chunk.try_into().unwrap()); + a.count_ones() as usize + }) + .sum::(); + + // and any bytes that do not fit in the group + set_count += chunks + .remainder() + .iter() + .map(|byte| byte.count_ones() as usize) + .sum::(); + + len - set_count +} diff --git a/datafusion/src/row/mod.rs b/datafusion/src/row/mod.rs new file mode 100644 index 000000000000..edfe02274dd9 --- /dev/null +++ b/datafusion/src/row/mod.rs @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An implementation of Row backed by raw bytes +//! +//! Each tuple consists of up to three parts: [null bit set] [values] [var length data] +//! +//! The null bit set is used for null tracking and is aligned to 1-byte. It stores +//! one bit per field. +//! +//! In the region of the values, we store the fields in the order they are defined in the schema. +//! - For fixed-length, sequential access fields, we store them directly. +//! E.g., 4 bytes for int and 1 byte for bool. +//! - For fixed-length, update often fields, we store one 8-byte word per field. +//! - For fields of non-primitive or variable-length types, +//! we append their actual content to the end of the var length region and +//! store their offset relative to row base and their length, packed into an 8-byte word. + +use crate::row::bitmap::{get_bit, set_bit}; +use arrow::datatypes::{DataType, Schema}; +use std::sync::Arc; + +mod bitmap; + +struct Row<'a> { + data: &'a mut [u8], + base_offset: usize, + length: usize, + schema: Arc, + null_width: usize, + values_width: usize, + varlena_width: usize, + field_offsets: Vec, +} + +impl<'a> Row<'a> { + pub fn new(schema: Arc) -> Self { + assert!(supported(&schema)); + let null_width = null_width(schema.fields().len()); + let (field_offsets, values_width) = get_offsets(null_width, &schema); + Self { + data: &mut [], + base_offset: 0, + length: 0, + schema, + null_width, + values_width, + varlena_width: 0, + field_offsets, + } + } + + /// Update this row to point to position `offset` in `base` + pub fn point_to(&mut self, base: &'a mut [u8], offset: usize) { + self.data = base; + self.base_offset = offset; + } + + #[inline] + fn assert_index_valid(&self, idx: usize) { + assert!(idx < self.schema.fields().len()); + } + + // ---------------------- + // Accessors + // ---------------------- + + fn is_valid_at(&self, idx: usize) -> bool { + let null_bits = &self.data[self.base_offset..self.base_offset + self.null_width]; + get_bit(null_bits, idx) + } + + // ---------------------- + // Mutators + // ---------------------- + + fn set_null_at(&mut self, idx: usize) { + let null_bits = + &mut self.data[self.base_offset..self.base_offset + self.null_width]; + set_bit(null_bits, idx, false) + } + + fn set_non_null_at(&mut self, idx: usize) { + let null_bits = + &mut self.data[self.base_offset..self.base_offset + self.null_width]; + set_bit(null_bits, idx, true) + } + + /// End each row at 8-byte word boundary. + fn end_padding(&mut self) { + let payload_width = self.null_width + self.values_width + self.varlena_width; + self.length = (payload_width.saturating_add(7) / 8) * 8; + } +} + +/// Get number of bytes needed for null bit set +fn null_width(num_fields: usize) -> usize { + num_fields.saturating_add(7) / 8 +} + +/// Get relative offsets for each field and total width for values +fn get_offsets(null_width: usize, schema: &Arc) -> (Vec, usize) { + let mut offsets = vec![]; + let mut offset = null_width; + for f in schema.fields() { + offsets.push(offset); + offset += type_width(f.data_type()); + } + (offsets, offset - null_width) +} + +fn supported_type(dt: &DataType) -> bool { + use DataType::*; + matches!( + dt, + Boolean + | UInt8 + | UInt16 + | UInt32 + | UInt64 + | Int8 + | Int16 + | Int32 + | Int64 + | Float16 + | Float32 + | Float64 + | Date32 + | Date64 + | Utf8 + | Binary + ) +} + +fn var_length(dt: &DataType) -> bool { + use DataType::*; + matches!(dt, Utf8 | Binary) +} + +fn type_width(dt: &DataType) -> usize { + use DataType::*; + if var_length(dt) { + return 8; + } + match dt { + Boolean | UInt8 | Int8 => 1, + UInt16 | Int16 | Float16 => 2, + UInt32 | Int32 | Float32 | Date32 => 4, + UInt64 | Int64 | Float64 | Date64 => 8, + _ => unreachable!(), + } +} + +fn fixed_size(schema: &Arc) -> bool { + schema.fields().iter().all(|f| !var_length(f.data_type())) +} + +fn supported(schema: &Arc) -> bool { + schema + .fields() + .iter() + .all(|f| supported_type(f.data_type())) +} + +#[cfg(test)] +mod tests { + + #[test] + fn round_trip() { + assert!(true) + } +} From a919660c3213af8673281f40d1b8731a713b01af Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Sat, 5 Feb 2022 21:35:37 +0800 Subject: [PATCH 2/8] Write batch to row-wise page --- datafusion/src/row/mod.rs | 260 +++++++++++++++++++++++++++++++++++-- datafusion/src/row/page.rs | 227 ++++++++++++++++++++++++++++++++ 2 files changed, 475 insertions(+), 12 deletions(-) create mode 100644 datafusion/src/row/page.rs diff --git a/datafusion/src/row/mod.rs b/datafusion/src/row/mod.rs index edfe02274dd9..584ee92b5004 100644 --- a/datafusion/src/row/mod.rs +++ b/datafusion/src/row/mod.rs @@ -35,31 +35,41 @@ use arrow::datatypes::{DataType, Schema}; use std::sync::Arc; mod bitmap; +mod page; -struct Row<'a> { +const UTF8_DEFAULT_SIZE: usize = 20; +const BINARY_DEFAULT_SIZE: usize = 100; + +pub struct Row<'a> { data: &'a mut [u8], base_offset: usize, - length: usize, - schema: Arc, + field_count: usize, + row_width: usize, + pub(crate) fixed_size: bool, null_width: usize, values_width: usize, varlena_width: usize, + varlena_offset: usize, field_offsets: Vec, } impl<'a> Row<'a> { - pub fn new(schema: Arc) -> Self { + pub fn new(schema: &Arc) -> Self { assert!(supported(&schema)); - let null_width = null_width(schema.fields().len()); + let field_count = schema.fields().len(); + let null_width = null_width(field_count); let (field_offsets, values_width) = get_offsets(null_width, &schema); + let fixed_size = fixed_size(&schema); Self { data: &mut [], base_offset: 0, - length: 0, - schema, + field_count, + row_width: 0, + fixed_size, null_width, values_width, varlena_width: 0, + varlena_offset: null_width + values_width, field_offsets, } } @@ -68,11 +78,19 @@ impl<'a> Row<'a> { pub fn point_to(&mut self, base: &'a mut [u8], offset: usize) { self.data = base; self.base_offset = offset; + self.varlena_width = 0; + self.varlena_offset = self.null_width + self.values_width; + } + + pub fn new_from(schema: &Arc, base: &'a mut [u8], offset: usize) -> Self { + let mut row = Self::new(schema); + row.point_to(base, offset); + row } #[inline] fn assert_index_valid(&self, idx: usize) { - assert!(idx < self.schema.fields().len()); + assert!(idx < self.field_count); } // ---------------------- @@ -84,6 +102,104 @@ impl<'a> Row<'a> { get_bit(null_bits, idx) } + fn get_boolean(&self, idx: usize) -> bool { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + let value = &self.data[self.base_offset + offset..]; + value[0] != 0 + } + + fn get_u8(&self, idx: usize) -> u8 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[self.base_offset + offset] + } + + fn get_u16(&self, idx: usize) -> u16 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + u16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_u32(&self, idx: usize) -> u32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + u32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_u64(&self, idx: usize) -> u64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + u64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i8(&self, idx: usize) -> i8 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i8::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i16(&self, idx: usize) -> i16 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i32(&self, idx: usize) -> i32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i64(&self, idx: usize) -> i64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_f32(&self, idx: usize) -> f32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + f32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_f64(&self, idx: usize) -> f64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + f64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_date32(&self, idx: usize) -> i32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_date64(&self, idx: usize) -> i64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_utf8(&self, idx: usize) -> &str { + self.assert_index_valid(idx); + let offset_size = self.get_u64(idx); + let offset = (offset_size >> 32) as usize; + let len = (offset_size & 0xffff_ffff) as usize; + let varlena_offset = self.base_offset + offset; + let bytes = &self.data[varlena_offset..varlena_offset + len]; + std::str::from_utf8(bytes).unwrap() + } + + fn get_binary(&self, idx: usize) -> &[u8] { + self.assert_index_valid(idx); + let offset_size = self.get_u64(idx); + let offset = (offset_size >> 32) as usize; + let len = (offset_size & 0xffff_ffff) as usize; + let varlena_offset = self.base_offset + offset; + &self.data[varlena_offset..varlena_offset + len] + } + // ---------------------- // Mutators // ---------------------- @@ -100,10 +216,118 @@ impl<'a> Row<'a> { set_bit(null_bits, idx, true) } + fn set_boolean(&mut self, idx: usize, value: bool) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[self.base_offset + offset] = if value { 1 } else { 0 }; + } + + fn set_u8(&mut self, idx: usize, value: u8) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[self.base_offset + offset] = value; + } + + fn set_u16(&mut self, idx: usize, value: u16) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); + } + + fn set_u32(&mut self, idx: usize, value: u32) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_u64(&mut self, idx: usize, value: u64) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_i8(&mut self, idx: usize, value: i8) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset] = value.to_le_bytes()[0]; + } + + fn set_i16(&mut self, idx: usize, value: i16) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); + } + + fn set_i32(&mut self, idx: usize, value: i32) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_i64(&mut self, idx: usize, value: i64) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_f32(&mut self, idx: usize, value: f32) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_f64(&mut self, idx: usize, value: f64) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_date32(&mut self, idx: usize, value: i32) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_date64(&mut self, idx: usize, value: i64) { + self.assert_index_valid(idx); + let offset = self.base_offset + self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_offset_size(&mut self, idx: usize, size: usize) { + let offset_and_size: u64 = (self.varlena_offset << 32 | size) as u64; + self.set_u64(idx, offset_and_size); + } + + fn set_utf8(&mut self, idx: usize, value: &str) { + self.assert_index_valid(idx); + let bytes = value.as_bytes(); + let size = bytes.len(); + self.set_offset_size(idx, size); + let varlena_offset = self.base_offset + self.varlena_offset; + self.data[varlena_offset..varlena_offset + size].copy_from_slice(&bytes); + self.varlena_offset += size; + self.varlena_width += size; + } + + fn set_binary(&mut self, idx: usize, value: &[u8]) { + self.assert_index_valid(idx); + let size = value.len(); + self.set_offset_size(idx, size); + let varlena_offset = self.base_offset + self.varlena_offset; + self.data[varlena_offset..varlena_offset + size].copy_from_slice(&value); + self.varlena_offset += size; + self.varlena_width += size; + } + + pub fn current_width(&self) -> usize { + self.null_width + self.values_width + self.varlena_width + } + /// End each row at 8-byte word boundary. fn end_padding(&mut self) { - let payload_width = self.null_width + self.values_width + self.varlena_width; - self.length = (payload_width.saturating_add(7) / 8) * 8; + let payload_width = self.current_width(); + self.row_width = (payload_width.saturating_add(7) / 8) * 8; } } @@ -136,7 +360,6 @@ fn supported_type(dt: &DataType) -> bool { | Int16 | Int32 | Int64 - | Float16 | Float32 | Float64 | Date32 @@ -158,13 +381,26 @@ fn type_width(dt: &DataType) -> usize { } match dt { Boolean | UInt8 | Int8 => 1, - UInt16 | Int16 | Float16 => 2, + UInt16 | Int16 => 2, UInt32 | Int32 | Float32 | Date32 => 4, UInt64 | Int64 | Float64 | Date64 => 8, _ => unreachable!(), } } +fn estimate_row_width(schema: &Arc) -> usize { + let mut width = 0; + for f in schema.fields() { + width += type_width(f.data_type()); + match f.data_type() { + DataType::Utf8 => width += UTF8_DEFAULT_SIZE, + DataType::Binary => width += BINARY_DEFAULT_SIZE, + _ => {} + } + } + width +} + fn fixed_size(schema: &Arc) -> bool { schema.fields().iter().all(|f| !var_length(f.data_type())) } diff --git a/datafusion/src/row/page.rs b/datafusion/src/row/page.rs new file mode 100644 index 000000000000..6bd8290e024f --- /dev/null +++ b/datafusion/src/row/page.rs @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Page holds multiple row-wise tuples + +use crate::row::{supported, Row, BINARY_DEFAULT_SIZE, UTF8_DEFAULT_SIZE}; +use arrow::array::Array; +use arrow::datatypes::{DataType, Schema}; +use arrow::record_batch::RecordBatch; +use std::sync::Arc; + +use super::estimate_row_width; + +struct Page { + data: Vec, + capacity: usize, + available: usize, + /// The start offsets of each row in the page. + /// The last offset equals the current size of the page. + rows: Vec, + schema: Arc, +} + +impl Page { + fn new(capacity: usize, schema: Arc) -> Self { + assert!(supported(&schema)); + Self { + data: vec![0; capacity], + capacity, + available: 0, + rows: vec![0], + schema, + } + } + + /// Append batch from `row_idx` to Page and returns (is_page_full, next_row_to_write) + fn write_batch(&mut self, batch: &RecordBatch, row_idx: usize) -> (bool, usize) { + let mut row = Row::new_from(&self.schema, &mut self.data, self.available); + let estimate_row_width = estimate_row_width(&self.schema); + + if row.fixed_size { + for cur_row in row_idx..batch.num_rows() { + if !self.has_space(estimate_row_width) { + return (true, cur_row); + } + self.write_row_unchecked(&mut row, cur_row, batch); + row.point_to(&mut self.data, self.available); + } + } else { + for cur_row in row_idx..batch.num_rows() { + if !self.has_space(estimate_row_width) { + return (true, cur_row); + } + let success = self.write_row(&mut row, cur_row, batch); + if !success { + return (true, cur_row); + } + row.point_to(&mut self.data, self.available); + } + } + (false, batch.num_rows()) + } + + fn write_row(&mut self, row: &mut Row, row_idx: usize, batch: &RecordBatch) -> bool { + // Get the row from the batch denoted by row_idx + for ((i, f), col) in self + .schema + .fields() + .iter() + .enumerate() + .zip(batch.columns().iter()) + { + if !c.is_null(row_idx) { + if !self.write_field(i, row_idx, col, f.data_type(), row) { + return false; + } + row.set_non_null_at(i); + } else { + row.set_null_at(i); + } + } + + row.end_padding(); + self.available += row.row_width; + self.rows.push(self.available); + true + } + + fn write_row_unchecked( + &mut self, + row: &mut Row, + row_idx: usize, + batch: &RecordBatch, + ) { + // Get the row from the batch denoted by row_idx + for ((i, f), col) in self + .schema + .fields() + .iter() + .enumerate() + .zip(batch.columns().iter()) + { + if !c.is_null(row_idx) { + row.set_non_null_at(i); + self.write_field(i, row_idx, col, f.data_type(), row); + } else { + row.set_null_at(i); + } + } + + row.end_padding(); + self.available += row.row_width; + self.rows.push(self.available); + } + + fn write_field( + &mut self, + col_idx: usize, + row_idx: usize, + col: &Arc, + dt: &DataTypem, + row: &mut Row, + ) -> bool { + // TODO: JIT compile this + use arrow::array::*; + use DataType::*; + match dt { + Boolean => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_bool(col_idx, c.value(row_idx)); + } + UInt8 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u8(col_idx, c.value(row_idx)); + } + UInt16 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u16(col_idx, c.value(row_idx)); + } + UInt32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u32(col_idx, c.value(row_idx)); + } + UInt64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u64(col_idx, c.value(row_idx)); + } + Int8 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i8(col_idx, c.value(row_idx)); + } + Int16 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i16(col_idx, c.value(row_idx)); + } + Int32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i32(col_idx, c.value(row_idx)); + } + Int64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i64(col_idx, c.value(row_idx)); + } + Float32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_f32(col_idx, c.value(row_idx)); + } + Float64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_f64(col_idx, c.value(row_idx)); + } + Date32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_date32(col_idx, c.value(row_idx)); + } + Date64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_date64(col_idx, c.value(row_idx)); + } + Utf8 => { + let c = col.as_any().downcast_ref::().unwrap(); + let str = c.value(row_idx); + let len = str.as_bytes().len(); + if len > UTF8_DEFAULT_SIZE && self.has_space(len + row.current_width()) { + return false; + } + row.set_utf8(col_idx, str); + } + Binary => { + let c = col.as_any().downcast_ref::().unwrap(); + let binary = c.value(row_idx); + let len = binary.len(); + if len > BINARY_DEFAULT_SIZE && self.has_space(len + row.current_width()) + { + return false; + } + row.set_binary(col_idx, binary); + } + _ => unimplemented!(), + } + true + } + + fn get_row(&self, row_id: usize) -> &[u8] { + let row_offset = self.rows[row_id]; + &self.data[row_offset..row_offset + self.rows[row_id + 1] - row_offset] + } + + /// has enough space for a row of the given size + fn has_space(&self, row_size: usize) -> bool { + self.available + row_size <= self.capacity + } +} From fc5a11021e737cf1e09bd9d6437ce5772a634b56 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Mon, 7 Feb 2022 10:00:34 +0800 Subject: [PATCH 3/8] Row reader and writer basic functionalities --- Cargo.toml | 4 + datafusion/src/row/mod.rs | 297 +----------------------- datafusion/src/row/page.rs | 227 ------------------ datafusion/src/row/reader.rs | 433 +++++++++++++++++++++++++++++++++++ datafusion/src/row/writer.rs | 344 ++++++++++++++++++++++++++++ 5 files changed, 784 insertions(+), 521 deletions(-) delete mode 100644 datafusion/src/row/page.rs create mode 100644 datafusion/src/row/reader.rs create mode 100644 datafusion/src/row/writer.rs diff --git a/Cargo.toml b/Cargo.toml index ea1acc04e687..126fbf7db682 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,3 +31,7 @@ members = [ [profile.release] lto = true codegen-units = 1 + +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" } +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" } diff --git a/datafusion/src/row/mod.rs b/datafusion/src/row/mod.rs index 584ee92b5004..740853fea550 100644 --- a/datafusion/src/row/mod.rs +++ b/datafusion/src/row/mod.rs @@ -30,307 +30,16 @@ //! we append their actual content to the end of the var length region and //! store their offset relative to row base and their length, packed into an 8-byte word. -use crate::row::bitmap::{get_bit, set_bit}; use arrow::datatypes::{DataType, Schema}; use std::sync::Arc; mod bitmap; -mod page; +mod reader; +mod writer; const UTF8_DEFAULT_SIZE: usize = 20; const BINARY_DEFAULT_SIZE: usize = 100; -pub struct Row<'a> { - data: &'a mut [u8], - base_offset: usize, - field_count: usize, - row_width: usize, - pub(crate) fixed_size: bool, - null_width: usize, - values_width: usize, - varlena_width: usize, - varlena_offset: usize, - field_offsets: Vec, -} - -impl<'a> Row<'a> { - pub fn new(schema: &Arc) -> Self { - assert!(supported(&schema)); - let field_count = schema.fields().len(); - let null_width = null_width(field_count); - let (field_offsets, values_width) = get_offsets(null_width, &schema); - let fixed_size = fixed_size(&schema); - Self { - data: &mut [], - base_offset: 0, - field_count, - row_width: 0, - fixed_size, - null_width, - values_width, - varlena_width: 0, - varlena_offset: null_width + values_width, - field_offsets, - } - } - - /// Update this row to point to position `offset` in `base` - pub fn point_to(&mut self, base: &'a mut [u8], offset: usize) { - self.data = base; - self.base_offset = offset; - self.varlena_width = 0; - self.varlena_offset = self.null_width + self.values_width; - } - - pub fn new_from(schema: &Arc, base: &'a mut [u8], offset: usize) -> Self { - let mut row = Self::new(schema); - row.point_to(base, offset); - row - } - - #[inline] - fn assert_index_valid(&self, idx: usize) { - assert!(idx < self.field_count); - } - - // ---------------------- - // Accessors - // ---------------------- - - fn is_valid_at(&self, idx: usize) -> bool { - let null_bits = &self.data[self.base_offset..self.base_offset + self.null_width]; - get_bit(null_bits, idx) - } - - fn get_boolean(&self, idx: usize) -> bool { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - let value = &self.data[self.base_offset + offset..]; - value[0] != 0 - } - - fn get_u8(&self, idx: usize) -> u8 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[self.base_offset + offset] - } - - fn get_u16(&self, idx: usize) -> u16 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - u16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_u32(&self, idx: usize) -> u32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - u32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_u64(&self, idx: usize) -> u64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - u64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i8(&self, idx: usize) -> i8 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i8::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i16(&self, idx: usize) -> i16 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i32(&self, idx: usize) -> i32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i64(&self, idx: usize) -> i64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_f32(&self, idx: usize) -> f32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - f32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_f64(&self, idx: usize) -> f64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - f64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_date32(&self, idx: usize) -> i32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_date64(&self, idx: usize) -> i64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_utf8(&self, idx: usize) -> &str { - self.assert_index_valid(idx); - let offset_size = self.get_u64(idx); - let offset = (offset_size >> 32) as usize; - let len = (offset_size & 0xffff_ffff) as usize; - let varlena_offset = self.base_offset + offset; - let bytes = &self.data[varlena_offset..varlena_offset + len]; - std::str::from_utf8(bytes).unwrap() - } - - fn get_binary(&self, idx: usize) -> &[u8] { - self.assert_index_valid(idx); - let offset_size = self.get_u64(idx); - let offset = (offset_size >> 32) as usize; - let len = (offset_size & 0xffff_ffff) as usize; - let varlena_offset = self.base_offset + offset; - &self.data[varlena_offset..varlena_offset + len] - } - - // ---------------------- - // Mutators - // ---------------------- - - fn set_null_at(&mut self, idx: usize) { - let null_bits = - &mut self.data[self.base_offset..self.base_offset + self.null_width]; - set_bit(null_bits, idx, false) - } - - fn set_non_null_at(&mut self, idx: usize) { - let null_bits = - &mut self.data[self.base_offset..self.base_offset + self.null_width]; - set_bit(null_bits, idx, true) - } - - fn set_boolean(&mut self, idx: usize, value: bool) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[self.base_offset + offset] = if value { 1 } else { 0 }; - } - - fn set_u8(&mut self, idx: usize, value: u8) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[self.base_offset + offset] = value; - } - - fn set_u16(&mut self, idx: usize, value: u16) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); - } - - fn set_u32(&mut self, idx: usize, value: u32) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); - } - - fn set_u64(&mut self, idx: usize, value: u64) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); - } - - fn set_i8(&mut self, idx: usize, value: i8) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset] = value.to_le_bytes()[0]; - } - - fn set_i16(&mut self, idx: usize, value: i16) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); - } - - fn set_i32(&mut self, idx: usize, value: i32) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); - } - - fn set_i64(&mut self, idx: usize, value: i64) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); - } - - fn set_f32(&mut self, idx: usize, value: f32) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); - } - - fn set_f64(&mut self, idx: usize, value: f64) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); - } - - fn set_date32(&mut self, idx: usize, value: i32) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); - } - - fn set_date64(&mut self, idx: usize, value: i64) { - self.assert_index_valid(idx); - let offset = self.base_offset + self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); - } - - fn set_offset_size(&mut self, idx: usize, size: usize) { - let offset_and_size: u64 = (self.varlena_offset << 32 | size) as u64; - self.set_u64(idx, offset_and_size); - } - - fn set_utf8(&mut self, idx: usize, value: &str) { - self.assert_index_valid(idx); - let bytes = value.as_bytes(); - let size = bytes.len(); - self.set_offset_size(idx, size); - let varlena_offset = self.base_offset + self.varlena_offset; - self.data[varlena_offset..varlena_offset + size].copy_from_slice(&bytes); - self.varlena_offset += size; - self.varlena_width += size; - } - - fn set_binary(&mut self, idx: usize, value: &[u8]) { - self.assert_index_valid(idx); - let size = value.len(); - self.set_offset_size(idx, size); - let varlena_offset = self.base_offset + self.varlena_offset; - self.data[varlena_offset..varlena_offset + size].copy_from_slice(&value); - self.varlena_offset += size; - self.varlena_width += size; - } - - pub fn current_width(&self) -> usize { - self.null_width + self.values_width + self.varlena_width - } - - /// End each row at 8-byte word boundary. - fn end_padding(&mut self) { - let payload_width = self.current_width(); - self.row_width = (payload_width.saturating_add(7) / 8) * 8; - } -} - /// Get number of bytes needed for null bit set fn null_width(num_fields: usize) -> usize { num_fields.saturating_add(7) / 8 @@ -398,7 +107,7 @@ fn estimate_row_width(schema: &Arc) -> usize { _ => {} } } - width + (width.saturating_add(7) / 8) * 8 } fn fixed_size(schema: &Arc) -> bool { diff --git a/datafusion/src/row/page.rs b/datafusion/src/row/page.rs deleted file mode 100644 index 6bd8290e024f..000000000000 --- a/datafusion/src/row/page.rs +++ /dev/null @@ -1,227 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Page holds multiple row-wise tuples - -use crate::row::{supported, Row, BINARY_DEFAULT_SIZE, UTF8_DEFAULT_SIZE}; -use arrow::array::Array; -use arrow::datatypes::{DataType, Schema}; -use arrow::record_batch::RecordBatch; -use std::sync::Arc; - -use super::estimate_row_width; - -struct Page { - data: Vec, - capacity: usize, - available: usize, - /// The start offsets of each row in the page. - /// The last offset equals the current size of the page. - rows: Vec, - schema: Arc, -} - -impl Page { - fn new(capacity: usize, schema: Arc) -> Self { - assert!(supported(&schema)); - Self { - data: vec![0; capacity], - capacity, - available: 0, - rows: vec![0], - schema, - } - } - - /// Append batch from `row_idx` to Page and returns (is_page_full, next_row_to_write) - fn write_batch(&mut self, batch: &RecordBatch, row_idx: usize) -> (bool, usize) { - let mut row = Row::new_from(&self.schema, &mut self.data, self.available); - let estimate_row_width = estimate_row_width(&self.schema); - - if row.fixed_size { - for cur_row in row_idx..batch.num_rows() { - if !self.has_space(estimate_row_width) { - return (true, cur_row); - } - self.write_row_unchecked(&mut row, cur_row, batch); - row.point_to(&mut self.data, self.available); - } - } else { - for cur_row in row_idx..batch.num_rows() { - if !self.has_space(estimate_row_width) { - return (true, cur_row); - } - let success = self.write_row(&mut row, cur_row, batch); - if !success { - return (true, cur_row); - } - row.point_to(&mut self.data, self.available); - } - } - (false, batch.num_rows()) - } - - fn write_row(&mut self, row: &mut Row, row_idx: usize, batch: &RecordBatch) -> bool { - // Get the row from the batch denoted by row_idx - for ((i, f), col) in self - .schema - .fields() - .iter() - .enumerate() - .zip(batch.columns().iter()) - { - if !c.is_null(row_idx) { - if !self.write_field(i, row_idx, col, f.data_type(), row) { - return false; - } - row.set_non_null_at(i); - } else { - row.set_null_at(i); - } - } - - row.end_padding(); - self.available += row.row_width; - self.rows.push(self.available); - true - } - - fn write_row_unchecked( - &mut self, - row: &mut Row, - row_idx: usize, - batch: &RecordBatch, - ) { - // Get the row from the batch denoted by row_idx - for ((i, f), col) in self - .schema - .fields() - .iter() - .enumerate() - .zip(batch.columns().iter()) - { - if !c.is_null(row_idx) { - row.set_non_null_at(i); - self.write_field(i, row_idx, col, f.data_type(), row); - } else { - row.set_null_at(i); - } - } - - row.end_padding(); - self.available += row.row_width; - self.rows.push(self.available); - } - - fn write_field( - &mut self, - col_idx: usize, - row_idx: usize, - col: &Arc, - dt: &DataTypem, - row: &mut Row, - ) -> bool { - // TODO: JIT compile this - use arrow::array::*; - use DataType::*; - match dt { - Boolean => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_bool(col_idx, c.value(row_idx)); - } - UInt8 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_u8(col_idx, c.value(row_idx)); - } - UInt16 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_u16(col_idx, c.value(row_idx)); - } - UInt32 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_u32(col_idx, c.value(row_idx)); - } - UInt64 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_u64(col_idx, c.value(row_idx)); - } - Int8 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_i8(col_idx, c.value(row_idx)); - } - Int16 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_i16(col_idx, c.value(row_idx)); - } - Int32 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_i32(col_idx, c.value(row_idx)); - } - Int64 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_i64(col_idx, c.value(row_idx)); - } - Float32 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_f32(col_idx, c.value(row_idx)); - } - Float64 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_f64(col_idx, c.value(row_idx)); - } - Date32 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_date32(col_idx, c.value(row_idx)); - } - Date64 => { - let c = col.as_any().downcast_ref::().unwrap(); - row.set_date64(col_idx, c.value(row_idx)); - } - Utf8 => { - let c = col.as_any().downcast_ref::().unwrap(); - let str = c.value(row_idx); - let len = str.as_bytes().len(); - if len > UTF8_DEFAULT_SIZE && self.has_space(len + row.current_width()) { - return false; - } - row.set_utf8(col_idx, str); - } - Binary => { - let c = col.as_any().downcast_ref::().unwrap(); - let binary = c.value(row_idx); - let len = binary.len(); - if len > BINARY_DEFAULT_SIZE && self.has_space(len + row.current_width()) - { - return false; - } - row.set_binary(col_idx, binary); - } - _ => unimplemented!(), - } - true - } - - fn get_row(&self, row_id: usize) -> &[u8] { - let row_offset = self.rows[row_id]; - &self.data[row_offset..row_offset + self.rows[row_id + 1] - row_offset] - } - - /// has enough space for a row of the given size - fn has_space(&self, row_size: usize) -> bool { - self.available + row_size <= self.capacity - } -} diff --git a/datafusion/src/row/reader.rs b/datafusion/src/row/reader.rs new file mode 100644 index 000000000000..d9db4cf5259a --- /dev/null +++ b/datafusion/src/row/reader.rs @@ -0,0 +1,433 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Accessing row from raw bytes + +use crate::error::{DataFusionError, Result}; +use crate::row::bitmap::get_bit; +use crate::row::{get_offsets, null_width, supported}; +use arrow::array::{make_builder, Array, ArrayBuilder}; +use arrow::datatypes::{DataType, Schema}; +use arrow::error::Result as ArrowResult; +use arrow::record_batch::RecordBatch; +use std::sync::Arc; + +/// Read `data` of raw-bytes rows starting at `offsets` out to a record batch +pub fn read_as_batch( + data: &mut [u8], + schema: Arc, + offsets: Vec, +) -> Result { + let row_num = offsets.len(); + let mut output = MutableRecordBatch::new(row_num, schema.clone()); + let mut row = RowReader::new(&schema, data); + + for i in 0..row_num { + row.point_to(offsets[i]); + read_row(&row, &mut output, &schema)? + } + + output.output().map_err(DataFusionError::ArrowError) +} + +struct RowReader<'a> { + data: &'a [u8], + base_offset: usize, + field_count: usize, + field_offsets: Vec, +} + +impl<'a> RowReader<'a> { + fn new(schema: &Arc, data: &'a [u8]) -> Self { + assert!(supported(&schema)); + let field_count = schema.fields().len(); + let null_width = null_width(field_count); + let (field_offsets, _) = get_offsets(null_width, &schema); + Self { + data, + base_offset: 0, + field_count, + field_offsets, + } + } + + /// Update this row to point to position `offset` in `base` + fn point_to(&mut self, offset: usize) { + self.base_offset = offset; + } + + #[inline] + fn assert_index_valid(&self, idx: usize) { + assert!(idx < self.field_count); + } + + // ---------------------- + // Accessors + // ---------------------- + + fn is_valid_at(&self, idx: usize) -> bool { + get_bit(&self.data, idx) + } + + fn get_bool(&self, idx: usize) -> bool { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + let value = &self.data[self.base_offset + offset..]; + value[0] != 0 + } + + fn get_u8(&self, idx: usize) -> u8 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[self.base_offset + offset] + } + + fn get_u16(&self, idx: usize) -> u16 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + u16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_u32(&self, idx: usize) -> u32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + u32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_u64(&self, idx: usize) -> u64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + u64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i8(&self, idx: usize) -> i8 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i8::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i16(&self, idx: usize) -> i16 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i32(&self, idx: usize) -> i32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_i64(&self, idx: usize) -> i64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_f32(&self, idx: usize) -> f32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + f32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_f64(&self, idx: usize) -> f64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + f64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_date32(&self, idx: usize) -> i32 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_date64(&self, idx: usize) -> i64 { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + } + + fn get_utf8(&self, idx: usize) -> &str { + self.assert_index_valid(idx); + let offset_size = self.get_u64(idx); + let offset = (offset_size >> 32) as usize; + let len = (offset_size & 0xffff_ffff) as usize; + let varlena_offset = self.base_offset + offset; + let bytes = &self.data[varlena_offset..varlena_offset + len]; + std::str::from_utf8(bytes).unwrap() + } + + fn get_binary(&self, idx: usize) -> &[u8] { + self.assert_index_valid(idx); + let offset_size = self.get_u64(idx); + let offset = (offset_size >> 32) as usize; + let len = (offset_size & 0xffff_ffff) as usize; + let varlena_offset = self.base_offset + offset; + &self.data[varlena_offset..varlena_offset + len] + } + + fn get_bool_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_bool(idx)) + } else { + None + } + } + + fn get_u8_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_u8(idx)) + } else { + None + } + } + + fn get_u16_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_u16(idx)) + } else { + None + } + } + + fn get_u32_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_u32(idx)) + } else { + None + } + } + + fn get_u64_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_u64(idx)) + } else { + None + } + } + + fn get_i8_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_i8(idx)) + } else { + None + } + } + + fn get_i16_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_i16(idx)) + } else { + None + } + } + + fn get_i32_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_i32(idx)) + } else { + None + } + } + + fn get_i64_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_i64(idx)) + } else { + None + } + } + + fn get_f32_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_f32(idx)) + } else { + None + } + } + + fn get_f64_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_f64(idx)) + } else { + None + } + } + + fn get_date32_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_date32(idx)) + } else { + None + } + } + + fn get_date64_opt(&self, idx: usize) -> Option { + if self.is_valid_at(idx) { + Some(self.get_date64(idx)) + } else { + None + } + } + + fn get_utf8_opt(&self, idx: usize) -> Option<&str> { + if self.is_valid_at(idx) { + Some(self.get_utf8(idx)) + } else { + None + } + } + + fn get_binary_opt(&self, idx: usize) -> Option<&[u8]> { + if self.is_valid_at(idx) { + Some(self.get_binary(idx)) + } else { + None + } + } +} + +fn read_row( + row: &RowReader, + batch: &mut MutableRecordBatch, + schema: &Arc, +) -> Result<()> { + for ((col_idx, to), field) in batch + .arrays + .iter_mut() + .enumerate() + .zip(schema.fields().iter()) + { + read_field(to, field.data_type(), col_idx, row)? + } + Ok(()) +} + +fn read_field( + to: &mut Box, + dt: &DataType, + col_idx: usize, + row: &RowReader, +) -> Result<()> { + use arrow::array::*; + use DataType::*; + match dt { + Boolean => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_bool_opt(col_idx))?; + } + UInt8 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_u8_opt(col_idx))?; + } + UInt16 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_u16_opt(col_idx))?; + } + UInt32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_u32_opt(col_idx))?; + } + UInt64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_u64_opt(col_idx))?; + } + Int8 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_i8_opt(col_idx))?; + } + Int16 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_i16_opt(col_idx))?; + } + Int32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_i32_opt(col_idx))?; + } + Int64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_i64_opt(col_idx))?; + } + Float32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_f32_opt(col_idx))?; + } + Float64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_f64_opt(col_idx))?; + } + Date32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_date32_opt(col_idx))?; + } + Date64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_date64_opt(col_idx))?; + } + Utf8 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_option(row.get_utf8_opt(col_idx))?; + } + Binary => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + if row.is_valid_at(col_idx) { + to.append_value(row.get_binary(col_idx))?; + } else { + to.append_null()?; + } + } + _ => unimplemented!(), + } + Ok(()) +} + +struct MutableRecordBatch { + arrays: Vec>, + schema: Arc, +} + +impl MutableRecordBatch { + fn new(target_batch_size: usize, schema: Arc) -> Self { + let arrays = new_arrays(&schema, target_batch_size); + Self { arrays, schema } + } + + fn output(&mut self) -> ArrowResult { + let result = make_batch(self.schema.clone(), self.arrays.drain(..).collect()); + result + } +} + +fn new_arrays(schema: &Arc, batch_size: usize) -> Vec> { + schema + .fields() + .iter() + .map(|field| { + let dt = field.data_type(); + make_builder(dt, batch_size) + }) + .collect::>() +} + +fn make_batch( + schema: Arc, + mut arrays: Vec>, +) -> ArrowResult { + let columns = arrays.iter_mut().map(|array| array.finish()).collect(); + RecordBatch::try_new(schema, columns) +} diff --git a/datafusion/src/row/writer.rs b/datafusion/src/row/writer.rs new file mode 100644 index 000000000000..394d17839ac9 --- /dev/null +++ b/datafusion/src/row/writer.rs @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Reusable row writer backed by Vec to stitch attributes together + +use crate::row::bitmap::{get_bit, set_bit}; +use crate::row::{estimate_row_width, fixed_size, get_offsets, null_width, supported}; +use arrow::array::Array; +use arrow::datatypes::{DataType, Schema}; +use arrow::record_batch::RecordBatch; +use std::cmp::max; +use std::sync::Arc; + +/// Append batch from `row_idx` to `output` buffer start from `offset` +/// # Panics +/// +/// This function will panic if the output buffer doesn't have enough space to hold all the rows +pub fn write_batch_unchecked( + output: &mut [u8], + offset: usize, + batch: &RecordBatch, + row_idx: usize, + schema: Arc, +) -> Vec { + let mut writer = RowWriter::new(&schema); + let mut current_offset = offset; + let mut offsets = vec![]; + for cur_row in row_idx..batch.num_rows() { + offsets.push(current_offset); + let row_width = write_row(&mut writer, cur_row, batch, &schema); + output[current_offset..current_offset + row_width] + .copy_from_slice(writer.get_row()); + current_offset += row_width; + } + offsets +} + +/// Reusable row writer backed by Vec +pub struct RowWriter { + data: Vec, + field_count: usize, + row_width: usize, + null_width: usize, + values_width: usize, + varlena_width: usize, + varlena_offset: usize, + field_offsets: Vec, +} + +impl RowWriter { + /// new + pub fn new(schema: &Arc) -> Self { + assert!(supported(&schema)); + let field_count = schema.fields().len(); + let null_width = null_width(field_count); + let (field_offsets, values_width) = get_offsets(null_width, &schema); + let mut init_capacity = estimate_row_width(&schema); + if !fixed_size(&schema) { + // double the capacity to avoid repeated resize + init_capacity *= 2; + } + Self { + data: vec![0; init_capacity], + field_count, + row_width: 0, + null_width, + values_width, + varlena_width: 0, + varlena_offset: null_width + values_width, + field_offsets, + } + } + + /// Reset the row writer state for new tuple + pub fn reset(&mut self) { + self.data.fill(0); + self.row_width = 0; + self.varlena_width = 0; + self.varlena_offset = self.null_width + self.values_width; + } + + #[inline] + fn assert_index_valid(&self, idx: usize) { + assert!(idx < self.field_count); + } + + fn set_null_at(&mut self, idx: usize) { + let null_bits = &mut self.data[0..self.null_width]; + set_bit(null_bits, idx, false) + } + + fn set_non_null_at(&mut self, idx: usize) { + let null_bits = &mut self.data[0..self.null_width]; + set_bit(null_bits, idx, true) + } + + fn set_bool(&mut self, idx: usize, value: bool) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset] = if value { 1 } else { 0 }; + } + + fn set_u8(&mut self, idx: usize, value: u8) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset] = value; + } + + fn set_u16(&mut self, idx: usize, value: u16) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); + } + + fn set_u32(&mut self, idx: usize, value: u32) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_u64(&mut self, idx: usize, value: u64) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_i8(&mut self, idx: usize, value: i8) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset] = value.to_le_bytes()[0]; + } + + fn set_i16(&mut self, idx: usize, value: i16) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); + } + + fn set_i32(&mut self, idx: usize, value: i32) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_i64(&mut self, idx: usize, value: i64) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_f32(&mut self, idx: usize, value: f32) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_f64(&mut self, idx: usize, value: f64) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_date32(&mut self, idx: usize, value: i32) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + } + + fn set_date64(&mut self, idx: usize, value: i64) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + } + + fn set_offset_size(&mut self, idx: usize, size: usize) { + let offset_and_size: u64 = (self.varlena_offset << 32 | size) as u64; + self.set_u64(idx, offset_and_size); + } + + fn set_utf8(&mut self, idx: usize, value: &str) { + self.assert_index_valid(idx); + let bytes = value.as_bytes(); + let size = bytes.len(); + self.set_offset_size(idx, size); + let varlena_offset = self.varlena_offset; + self.data[varlena_offset..varlena_offset + size].copy_from_slice(&bytes); + self.varlena_offset += size; + self.varlena_width += size; + } + + fn set_binary(&mut self, idx: usize, value: &[u8]) { + self.assert_index_valid(idx); + let size = value.len(); + self.set_offset_size(idx, size); + let varlena_offset = self.varlena_offset; + self.data[varlena_offset..varlena_offset + size].copy_from_slice(&value); + self.varlena_offset += size; + self.varlena_width += size; + } + + fn current_width(&self) -> usize { + self.null_width + self.values_width + self.varlena_width + } + + /// End each row at 8-byte word boundary. + fn end_padding(&mut self) { + let payload_width = self.current_width(); + self.row_width = (payload_width.saturating_add(7) / 8) * 8; + if self.data.capacity() < self.row_width { + self.data.resize(self.row_width, 0); + } + } + + fn get_row(&self) -> &[u8] { + &self.data[0..self.row_width] + } +} + +/// Stitch attributes of tuple in `batch` at `row_idx` and returns the tuple width +fn write_row( + row: &mut RowWriter, + row_idx: usize, + batch: &RecordBatch, + schema: &Arc, +) -> usize { + // Get the row from the batch denoted by row_idx + for ((i, f), col) in schema + .fields() + .iter() + .enumerate() + .zip(batch.columns().iter()) + { + if !col.is_null(row_idx) { + row.set_non_null_at(i); + write_field(i, row_idx, col, f.data_type(), row); + } else { + row.set_null_at(i); + } + } + + row.end_padding(); + row.row_width +} + +fn write_field( + col_idx: usize, + row_idx: usize, + col: &Arc, + dt: &DataType, + row: &mut RowWriter, +) { + // TODO: JIT compile this + use arrow::array::*; + use DataType::*; + match dt { + Boolean => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_bool(col_idx, c.value(row_idx)); + } + UInt8 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u8(col_idx, c.value(row_idx)); + } + UInt16 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u16(col_idx, c.value(row_idx)); + } + UInt32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u32(col_idx, c.value(row_idx)); + } + UInt64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_u64(col_idx, c.value(row_idx)); + } + Int8 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i8(col_idx, c.value(row_idx)); + } + Int16 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i16(col_idx, c.value(row_idx)); + } + Int32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i32(col_idx, c.value(row_idx)); + } + Int64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_i64(col_idx, c.value(row_idx)); + } + Float32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_f32(col_idx, c.value(row_idx)); + } + Float64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_f64(col_idx, c.value(row_idx)); + } + Date32 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_date32(col_idx, c.value(row_idx)); + } + Date64 => { + let c = col.as_any().downcast_ref::().unwrap(); + row.set_date64(col_idx, c.value(row_idx)); + } + Utf8 => { + let c = col.as_any().downcast_ref::().unwrap(); + let str = c.value(row_idx); + let new_width = row.current_width() + str.as_bytes().len(); + if new_width > row.data.capacity() { + // double the capacity to avoid repeated resize + row.data.resize(max(row.data.capacity() * 2, new_width), 0); + } + row.set_utf8(col_idx, str); + } + Binary => { + let c = col.as_any().downcast_ref::().unwrap(); + let binary = c.value(row_idx); + let new_width = row.current_width() + binary.len(); + if new_width > row.data.capacity() { + // double the capacity to avoid repeated resize + row.data.resize(max(row.data.capacity() * 2, new_width), 0); + } + row.set_binary(col_idx, binary); + } + _ => unimplemented!(), + } +} From 0cb5a77adaf07267099d2fc08d544f59f0aaeb4a Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 8 Feb 2022 00:19:03 +0800 Subject: [PATCH 4/8] DRY with paste and macro --- datafusion/src/row/reader.rs | 206 ++++++++++------------------------- datafusion/src/row/writer.rs | 85 ++++++--------- 2 files changed, 91 insertions(+), 200 deletions(-) diff --git a/datafusion/src/row/reader.rs b/datafusion/src/row/reader.rs index d9db4cf5259a..23556d07d186 100644 --- a/datafusion/src/row/reader.rs +++ b/datafusion/src/row/reader.rs @@ -44,6 +44,44 @@ pub fn read_as_batch( output.output().map_err(DataFusionError::ArrowError) } +macro_rules! get_idx { + ($NATIVE: ident, $SELF: ident, $IDX: ident) => {{ + $SELF.assert_index_valid($IDX); + let offset = $SELF.field_offsets[$IDX]; + $NATIVE::from_le_bytes( + $SELF.data[$SELF.base_offset + offset..].try_into().unwrap(), + ) + }}; +} + +macro_rules! fn_get_idx { + ($NATIVE: ident) => { + paste::item! { + fn [](&self, idx: usize) -> $NATIVE { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + $NATIVE::from_le_bytes( + self.data[self.base_offset + offset..].try_into().unwrap(), + ) + } + } + }; +} + +macro_rules! fn_get_idx_opt { + ($NATIVE: ident) => { + paste::item! { + fn [](&self, idx: usize) -> Option<$NATIVE> { + if self.is_valid_at(idx) { + Some(self.[](idx)) + } else { + None + } + } + } + }; +} + struct RowReader<'a> { data: &'a [u8], base_offset: usize, @@ -96,70 +134,22 @@ impl<'a> RowReader<'a> { self.data[self.base_offset + offset] } - fn get_u16(&self, idx: usize) -> u16 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - u16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_u32(&self, idx: usize) -> u32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - u32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_u64(&self, idx: usize) -> u64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - u64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i8(&self, idx: usize) -> i8 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i8::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i16(&self, idx: usize) -> i16 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i16::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i32(&self, idx: usize) -> i32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_i64(&self, idx: usize) -> i64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_f32(&self, idx: usize) -> f32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - f32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } - - fn get_f64(&self, idx: usize) -> f64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - f64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) - } + fn_get_idx!(u16); + fn_get_idx!(u32); + fn_get_idx!(u64); + fn_get_idx!(i8); + fn_get_idx!(i16); + fn_get_idx!(i32); + fn_get_idx!(i64); + fn_get_idx!(f32); + fn_get_idx!(f64); fn get_date32(&self, idx: usize) -> i32 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i32::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + get_idx!(i32, self, idx) } fn get_date64(&self, idx: usize) -> i64 { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - i64::from_le_bytes(self.data[self.base_offset + offset..].try_into().unwrap()) + get_idx!(i64, self, idx) } fn get_utf8(&self, idx: usize) -> &str { @@ -181,93 +171,17 @@ impl<'a> RowReader<'a> { &self.data[varlena_offset..varlena_offset + len] } - fn get_bool_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_bool(idx)) - } else { - None - } - } - - fn get_u8_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_u8(idx)) - } else { - None - } - } - - fn get_u16_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_u16(idx)) - } else { - None - } - } - - fn get_u32_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_u32(idx)) - } else { - None - } - } - - fn get_u64_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_u64(idx)) - } else { - None - } - } - - fn get_i8_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_i8(idx)) - } else { - None - } - } - - fn get_i16_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_i16(idx)) - } else { - None - } - } - - fn get_i32_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_i32(idx)) - } else { - None - } - } - - fn get_i64_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_i64(idx)) - } else { - None - } - } - - fn get_f32_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_f32(idx)) - } else { - None - } - } - - fn get_f64_opt(&self, idx: usize) -> Option { - if self.is_valid_at(idx) { - Some(self.get_f64(idx)) - } else { - None - } - } + fn_get_idx_opt!(bool); + fn_get_idx_opt!(u8); + fn_get_idx_opt!(u16); + fn_get_idx_opt!(u32); + fn_get_idx_opt!(u64); + fn_get_idx_opt!(i8); + fn_get_idx_opt!(i16); + fn_get_idx_opt!(i32); + fn_get_idx_opt!(i64); + fn_get_idx_opt!(f32); + fn_get_idx_opt!(f64); fn get_date32_opt(&self, idx: usize) -> Option { if self.is_valid_at(idx) { diff --git a/datafusion/src/row/writer.rs b/datafusion/src/row/writer.rs index 394d17839ac9..be36a03084cd 100644 --- a/datafusion/src/row/writer.rs +++ b/datafusion/src/row/writer.rs @@ -17,7 +17,7 @@ //! Reusable row writer backed by Vec to stitch attributes together -use crate::row::bitmap::{get_bit, set_bit}; +use crate::row::bitmap::set_bit; use crate::row::{estimate_row_width, fixed_size, get_offsets, null_width, supported}; use arrow::array::Array; use arrow::datatypes::{DataType, Schema}; @@ -49,6 +49,26 @@ pub fn write_batch_unchecked( offsets } +macro_rules! set_idx { + ($WIDTH: literal, $SELF: ident, $IDX: ident, $VALUE: ident) => {{ + $SELF.assert_index_valid($IDX); + let offset = $SELF.field_offsets[$IDX]; + $SELF.data[offset..offset + $WIDTH].copy_from_slice(&$VALUE.to_le_bytes()); + }}; +} + +macro_rules! fn_set_idx { + ($NATIVE: ident, $WIDTH: literal) => { + paste::item! { + fn [](&mut self, idx: usize, value: $NATIVE) { + self.assert_index_valid(idx); + let offset = self.field_offsets[idx]; + self.data[offset..offset + $WIDTH].copy_from_slice(&value.to_le_bytes()); + } + } + }; +} + /// Reusable row writer backed by Vec pub struct RowWriter { data: Vec, @@ -120,23 +140,14 @@ impl RowWriter { self.data[offset] = value; } - fn set_u16(&mut self, idx: usize, value: u16) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); - } - - fn set_u32(&mut self, idx: usize, value: u32) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); - } - - fn set_u64(&mut self, idx: usize, value: u64) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); - } + fn_set_idx!(u16, 2); + fn_set_idx!(u32, 4); + fn_set_idx!(u64, 8); + fn_set_idx!(i16, 2); + fn_set_idx!(i32, 4); + fn_set_idx!(i64, 8); + fn_set_idx!(f32, 4); + fn_set_idx!(f64, 8); fn set_i8(&mut self, idx: usize, value: i8) { self.assert_index_valid(idx); @@ -144,46 +155,12 @@ impl RowWriter { self.data[offset] = value.to_le_bytes()[0]; } - fn set_i16(&mut self, idx: usize, value: i16) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 2].copy_from_slice(&value.to_le_bytes()); - } - - fn set_i32(&mut self, idx: usize, value: i32) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); - } - - fn set_i64(&mut self, idx: usize, value: i64) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); - } - - fn set_f32(&mut self, idx: usize, value: f32) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); - } - - fn set_f64(&mut self, idx: usize, value: f64) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); - } - fn set_date32(&mut self, idx: usize, value: i32) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 4].copy_from_slice(&value.to_le_bytes()); + set_idx!(4, self, idx, value) } fn set_date64(&mut self, idx: usize, value: i64) { - self.assert_index_valid(idx); - let offset = self.field_offsets[idx]; - self.data[offset..offset + 8].copy_from_slice(&value.to_le_bytes()); + set_idx!(8, self, idx, value) } fn set_offset_size(&mut self, idx: usize, size: usize) { From b58058b42fcb7d81ac1853b0232f9cd49d413fda Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 8 Feb 2022 17:54:37 +0800 Subject: [PATCH 5/8] Test cases --- datafusion/src/lib.rs | 1 + datafusion/src/row/bitmap/mod.rs | 107 ++++++++---------- datafusion/src/row/mod.rs | 121 ++++++++++++++++++-- datafusion/src/row/reader.rs | 186 +++++++++++++++++++++++-------- datafusion/src/row/writer.rs | 19 ++-- 5 files changed, 311 insertions(+), 123 deletions(-) diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index 1d5ec13d2035..a0f8279f709c 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -223,6 +223,7 @@ pub use arrow; pub use parquet; pub(crate) mod field_util; +#[allow(dead_code)] pub(crate) mod row; #[cfg(feature = "pyarrow")] diff --git a/datafusion/src/row/bitmap/mod.rs b/datafusion/src/row/bitmap/mod.rs index 13c6e09aaaaa..79a21f334bab 100644 --- a/datafusion/src/row/bitmap/mod.rs +++ b/datafusion/src/row/bitmap/mod.rs @@ -21,6 +21,8 @@ mod fmt; +pub use fmt::fmt; + const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; const UNSET_BIT_MASK: [u8; 8] = [ 255 - 1, @@ -32,6 +34,7 @@ const UNSET_BIT_MASK: [u8; 8] = [ 255 - 64, 255 - 128, ]; +const ALL_VALID_MASK: [u8; 8] = [1, 3, 7, 15, 31, 63, 127, 255]; /// Returns whether bit at position `i` in `byte` is set or not #[inline] @@ -55,12 +58,6 @@ pub fn set_bit(data: &mut [u8], i: usize, value: bool) { data[i / 8] = set(data[i / 8], i % 8, value); } -/// Returns whether bit at position `i` in `data` is set or not -#[inline] -pub fn get_bit(data: &[u8], i: usize) -> bool { - is_set(data[i / 8], i % 8) -} - /// Returns whether bit at position `i` in `data` is set or not. /// /// # Safety @@ -76,60 +73,54 @@ pub fn bytes_for(bits: usize) -> usize { bits.saturating_add(7) / 8 } -/// Returns the number of zero bits in the slice offsetted by `offset` and a length of `length`. -/// # Panics -/// This function panics iff `(offset + len).saturating_add(7) / 8 >= slice.len()` -/// because it corresponds to the situation where `len` is beyond bounds. -pub fn count_zeros(slice: &[u8], offset: usize, len: usize) -> usize { - if len == 0 { - return 0; - }; - - let mut slice = &slice[offset / 8..(offset + len).saturating_add(7) / 8]; - let offset = offset % 8; - - if (offset + len) / 8 == 0 { - // all within a single byte - let byte = (slice[0] >> offset) << (8 - len); - return len - byte.count_ones() as usize; +/// Returns if all fields are valid +pub fn all_valid(data: &[u8], n: usize) -> bool { + for item in data.iter().take(n / 8) { + if *item != ALL_VALID_MASK[7] { + return false; + } } - - // slice: [a1,a2,a3,a4], [a5,a6,a7,a8] - // offset: 3 - // len: 4 - // [__,__,__,a4], [a5,a6,a7,__] - let mut set_count = 0; - if offset != 0 { - // count all ignoring the first `offset` bits - // i.e. [__,__,__,a4] - set_count += (slice[0] >> offset).count_ones() as usize; - slice = &slice[1..]; - } - if (offset + len) % 8 != 0 { - let end_offset = (offset + len) % 8; // i.e. 3 + 4 = 7 - let last_index = slice.len() - 1; - // count all ignoring the last `offset` bits - // i.e. [a5,a6,a7,__] - set_count += (slice[last_index] << (8 - end_offset)).count_ones() as usize; - slice = &slice[..last_index]; + if n % 8 == 0 { + true + } else { + data[n / 8] == ALL_VALID_MASK[n % 8 - 1] } +} - // finally, count any and all bytes in the middle in groups of 8 - let mut chunks = slice.chunks_exact(8); - set_count += chunks - .by_ref() - .map(|chunk| { - let a = u64::from_ne_bytes(chunk.try_into().unwrap()); - a.count_ones() as usize - }) - .sum::(); - - // and any bytes that do not fit in the group - set_count += chunks - .remainder() - .iter() - .map(|byte| byte.count_ones() as usize) - .sum::(); +#[cfg(test)] +mod tests { + use super::*; + use rand::Rng; + + fn test_validity(bs: &[bool]) { + let mut data = vec![0; bytes_for(bs.len())]; + for (i, b) in bs.iter().enumerate() { + set_bit(&mut data, i, *b); + } + let expected = bs.iter().all(|f| *f); + assert_eq!(all_valid(&data, bs.len()), expected); + } - len - set_count + #[test] + fn test_all_valid() { + let sizes = [4, 8, 12, 16, 19, 23, 32, 44]; + for i in sizes { + { + // contains false + let input = { + let mut rng = rand::thread_rng(); + let mut input: Vec = vec![false; i]; + rng.fill(&mut input[..]); + input + }; + test_validity(&input); + } + + { + // all true + let input = vec![true; i]; + test_validity(&input); + } + } + } } diff --git a/datafusion/src/row/mod.rs b/datafusion/src/row/mod.rs index 740853fea550..340511ae7062 100644 --- a/datafusion/src/row/mod.rs +++ b/datafusion/src/row/mod.rs @@ -40,11 +40,6 @@ mod writer; const UTF8_DEFAULT_SIZE: usize = 20; const BINARY_DEFAULT_SIZE: usize = 100; -/// Get number of bytes needed for null bit set -fn null_width(num_fields: usize) -> usize { - num_fields.saturating_add(7) / 8 -} - /// Get relative offsets for each field and total width for values fn get_offsets(null_width: usize, schema: &Arc) -> (Vec, usize) { let mut offsets = vec![]; @@ -97,8 +92,8 @@ fn type_width(dt: &DataType) -> usize { } } -fn estimate_row_width(schema: &Arc) -> usize { - let mut width = 0; +fn estimate_row_width(null_width: usize, schema: &Arc) -> usize { + let mut width = null_width; for f in schema.fields() { width += type_width(f.data_type()); match f.data_type() { @@ -123,9 +118,115 @@ fn supported(schema: &Arc) -> bool { #[cfg(test)] mod tests { + use super::*; + use crate::error::Result; + use crate::row::reader::read_as_batch; + use crate::row::writer::write_batch_unchecked; + use arrow::record_batch::RecordBatch; + use arrow::{array::*, datatypes::*}; + use DataType::*; - #[test] - fn round_trip() { - assert!(true) + macro_rules! fn_test_single_type { + ($ARRAY: ident, $TYPE: expr, $VEC: expr) => { + paste::item! { + #[test] + #[allow(non_snake_case)] + fn []() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("a", $TYPE, false)])); + let a = $ARRAY::from($VEC); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(a)])?; + let mut vector = vec![0; 1024]; + let row_offsets = + { write_batch_unchecked(&mut vector, 0, &batch, 0, schema.clone()) }; + let output_batch = { read_as_batch(&mut vector, schema, row_offsets)? }; + assert_eq!(batch, output_batch); + Ok(()) + } + } + }; } + + fn_test_single_type!( + BooleanArray, + Boolean, + vec![Some(true), Some(false), None, Some(true), None] + ); + + fn_test_single_type!( + Int8Array, + Int8, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + Int16Array, + Int16, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + Int32Array, + Int32, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + Int64Array, + Int64, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + UInt8Array, + UInt8, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + UInt16Array, + UInt16, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + UInt32Array, + UInt32, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + UInt64Array, + UInt64, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + Float32Array, + Float32, + vec![Some(5.0), Some(7.0), None, Some(0.0), Some(111.0)] + ); + + fn_test_single_type!( + Float64Array, + Float64, + vec![Some(5.0), Some(7.0), None, Some(0.0), Some(111.0)] + ); + + fn_test_single_type!( + Date32Array, + Date32, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + Date64Array, + Date64, + vec![Some(5), Some(7), None, Some(0), Some(111)] + ); + + fn_test_single_type!( + StringArray, + Utf8, + vec![Some("hello"), Some("world"), None, Some(""), Some("")] + ); } diff --git a/datafusion/src/row/reader.rs b/datafusion/src/row/reader.rs index 23556d07d186..800db82532a3 100644 --- a/datafusion/src/row/reader.rs +++ b/datafusion/src/row/reader.rs @@ -18,9 +18,9 @@ //! Accessing row from raw bytes use crate::error::{DataFusionError, Result}; -use crate::row::bitmap::get_bit; -use crate::row::{get_offsets, null_width, supported}; -use arrow::array::{make_builder, Array, ArrayBuilder}; +use crate::row::bitmap::{all_valid, bytes_for, get_bit_unchecked}; +use crate::row::{get_offsets, supported}; +use arrow::array::{make_builder, ArrayBuilder}; use arrow::datatypes::{DataType, Schema}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; @@ -36,8 +36,8 @@ pub fn read_as_batch( let mut output = MutableRecordBatch::new(row_num, schema.clone()); let mut row = RowReader::new(&schema, data); - for i in 0..row_num { - row.point_to(offsets[i]); + for offset in offsets.iter().take(row_num) { + row.point_to(*offset); read_row(&row, &mut output, &schema)? } @@ -45,24 +45,24 @@ pub fn read_as_batch( } macro_rules! get_idx { - ($NATIVE: ident, $SELF: ident, $IDX: ident) => {{ + ($NATIVE: ident, $SELF: ident, $IDX: ident, $WIDTH: literal) => {{ $SELF.assert_index_valid($IDX); let offset = $SELF.field_offsets[$IDX]; - $NATIVE::from_le_bytes( - $SELF.data[$SELF.base_offset + offset..].try_into().unwrap(), - ) + let start = $SELF.base_offset + offset; + let end = start + $WIDTH; + $NATIVE::from_le_bytes($SELF.data[start..end].try_into().unwrap()) }}; } macro_rules! fn_get_idx { - ($NATIVE: ident) => { + ($NATIVE: ident, $WIDTH: literal) => { paste::item! { fn [](&self, idx: usize) -> $NATIVE { self.assert_index_valid(idx); let offset = self.field_offsets[idx]; - $NATIVE::from_le_bytes( - self.data[self.base_offset + offset..].try_into().unwrap(), - ) + let start = self.base_offset + offset; + let end = start + $WIDTH; + $NATIVE::from_le_bytes(self.data[start..end].try_into().unwrap()) } } }; @@ -86,19 +86,28 @@ struct RowReader<'a> { data: &'a [u8], base_offset: usize, field_count: usize, + null_width: usize, field_offsets: Vec, } +impl<'a> std::fmt::Debug for RowReader<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let null_bits = self.null_bits(); + super::bitmap::fmt(null_bits, 0, self.null_width, f) + } +} + impl<'a> RowReader<'a> { fn new(schema: &Arc, data: &'a [u8]) -> Self { - assert!(supported(&schema)); + assert!(supported(schema)); let field_count = schema.fields().len(); - let null_width = null_width(field_count); - let (field_offsets, _) = get_offsets(null_width, &schema); + let null_width = bytes_for(field_count); + let (field_offsets, _) = get_offsets(null_width, schema); Self { data, base_offset: 0, field_count, + null_width, field_offsets, } } @@ -113,12 +122,20 @@ impl<'a> RowReader<'a> { assert!(idx < self.field_count); } - // ---------------------- - // Accessors - // ---------------------- + #[inline(always)] + fn null_bits(&self) -> &[u8] { + let start = self.base_offset; + &self.data[start..start + self.null_width] + } + + #[inline(always)] + fn all_valid(&self) -> bool { + let null_bits = self.null_bits(); + all_valid(null_bits, self.field_count) + } fn is_valid_at(&self, idx: usize) -> bool { - get_bit(&self.data, idx) + unsafe { get_bit_unchecked(self.null_bits(), idx) } } fn get_bool(&self, idx: usize) -> bool { @@ -134,22 +151,22 @@ impl<'a> RowReader<'a> { self.data[self.base_offset + offset] } - fn_get_idx!(u16); - fn_get_idx!(u32); - fn_get_idx!(u64); - fn_get_idx!(i8); - fn_get_idx!(i16); - fn_get_idx!(i32); - fn_get_idx!(i64); - fn_get_idx!(f32); - fn_get_idx!(f64); + fn_get_idx!(u16, 2); + fn_get_idx!(u32, 4); + fn_get_idx!(u64, 8); + fn_get_idx!(i8, 1); + fn_get_idx!(i16, 2); + fn_get_idx!(i32, 4); + fn_get_idx!(i64, 8); + fn_get_idx!(f32, 4); + fn_get_idx!(f64, 8); fn get_date32(&self, idx: usize) -> i32 { - get_idx!(i32, self, idx) + get_idx!(i32, self, idx, 4) } fn get_date64(&self, idx: usize) -> i64 { - get_idx!(i64, self, idx) + get_idx!(i64, self, idx, 8) } fn get_utf8(&self, idx: usize) -> &str { @@ -206,14 +223,6 @@ impl<'a> RowReader<'a> { None } } - - fn get_binary_opt(&self, idx: usize) -> Option<&[u8]> { - if self.is_valid_at(idx) { - Some(self.get_binary(idx)) - } else { - None - } - } } fn read_row( @@ -221,13 +230,24 @@ fn read_row( batch: &mut MutableRecordBatch, schema: &Arc, ) -> Result<()> { - for ((col_idx, to), field) in batch - .arrays - .iter_mut() - .enumerate() - .zip(schema.fields().iter()) - { - read_field(to, field.data_type(), col_idx, row)? + if row.all_valid() { + for ((col_idx, to), field) in batch + .arrays + .iter_mut() + .enumerate() + .zip(schema.fields().iter()) + { + read_field_null_free(to, field.data_type(), col_idx, row)? + } + } else { + for ((col_idx, to), field) in batch + .arrays + .iter_mut() + .enumerate() + .zip(schema.fields().iter()) + { + read_field(to, field.data_type(), col_idx, row)? + } } Ok(()) } @@ -310,6 +330,80 @@ fn read_field( Ok(()) } +fn read_field_null_free( + to: &mut Box, + dt: &DataType, + col_idx: usize, + row: &RowReader, +) -> Result<()> { + use arrow::array::*; + use DataType::*; + match dt { + Boolean => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_bool(col_idx))?; + } + UInt8 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_u8(col_idx))?; + } + UInt16 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_u16(col_idx))?; + } + UInt32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_u32(col_idx))?; + } + UInt64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_u64(col_idx))?; + } + Int8 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_i8(col_idx))?; + } + Int16 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_i16(col_idx))?; + } + Int32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_i32(col_idx))?; + } + Int64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_i64(col_idx))?; + } + Float32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_f32(col_idx))?; + } + Float64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_f64(col_idx))?; + } + Date32 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_date32(col_idx))?; + } + Date64 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_date64(col_idx))?; + } + Utf8 => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_utf8(col_idx))?; + } + Binary => { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_binary(col_idx))?; + } + _ => unimplemented!(), + } + Ok(()) +} + struct MutableRecordBatch { arrays: Vec>, schema: Arc, diff --git a/datafusion/src/row/writer.rs b/datafusion/src/row/writer.rs index be36a03084cd..2aef6911c37c 100644 --- a/datafusion/src/row/writer.rs +++ b/datafusion/src/row/writer.rs @@ -17,8 +17,8 @@ //! Reusable row writer backed by Vec to stitch attributes together -use crate::row::bitmap::set_bit; -use crate::row::{estimate_row_width, fixed_size, get_offsets, null_width, supported}; +use crate::row::bitmap::{bytes_for, set_bit}; +use crate::row::{estimate_row_width, fixed_size, get_offsets, supported}; use arrow::array::Array; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; @@ -45,6 +45,7 @@ pub fn write_batch_unchecked( output[current_offset..current_offset + row_width] .copy_from_slice(writer.get_row()); current_offset += row_width; + writer.reset() } offsets } @@ -84,12 +85,12 @@ pub struct RowWriter { impl RowWriter { /// new pub fn new(schema: &Arc) -> Self { - assert!(supported(&schema)); + assert!(supported(schema)); let field_count = schema.fields().len(); - let null_width = null_width(field_count); - let (field_offsets, values_width) = get_offsets(null_width, &schema); - let mut init_capacity = estimate_row_width(&schema); - if !fixed_size(&schema) { + let null_width = bytes_for(field_count); + let (field_offsets, values_width) = get_offsets(null_width, schema); + let mut init_capacity = estimate_row_width(null_width, schema); + if !fixed_size(schema) { // double the capacity to avoid repeated resize init_capacity *= 2; } @@ -174,7 +175,7 @@ impl RowWriter { let size = bytes.len(); self.set_offset_size(idx, size); let varlena_offset = self.varlena_offset; - self.data[varlena_offset..varlena_offset + size].copy_from_slice(&bytes); + self.data[varlena_offset..varlena_offset + size].copy_from_slice(bytes); self.varlena_offset += size; self.varlena_width += size; } @@ -184,7 +185,7 @@ impl RowWriter { let size = value.len(); self.set_offset_size(idx, size); let varlena_offset = self.varlena_offset; - self.data[varlena_offset..varlena_offset + size].copy_from_slice(&value); + self.data[varlena_offset..varlena_offset + size].copy_from_slice(value); self.varlena_offset += size; self.varlena_width += size; } From 240e9876b33a8bf37b4ee8769f8c4211c75fb3cf Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Tue, 8 Feb 2022 18:54:31 +0800 Subject: [PATCH 6/8] Test with parquet data --- datafusion/src/row/mod.rs | 101 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/datafusion/src/row/mod.rs b/datafusion/src/row/mod.rs index 340511ae7062..a48fb46f0432 100644 --- a/datafusion/src/row/mod.rs +++ b/datafusion/src/row/mod.rs @@ -119,7 +119,16 @@ fn supported(schema: &Arc) -> bool { #[cfg(test)] mod tests { use super::*; + use crate::datasource::file_format::parquet::ParquetFormat; + use crate::datasource::file_format::FileFormat; + use crate::datasource::object_store::local::{ + local_object_reader, local_object_reader_stream, local_unpartitioned_file, + LocalFileSystem, + }; use crate::error::Result; + use crate::execution::runtime_env::RuntimeEnv; + use crate::physical_plan::file_format::FileScanConfig; + use crate::physical_plan::{collect, ExecutionPlan}; use crate::row::reader::read_as_batch; use crate::row::writer::write_batch_unchecked; use arrow::record_batch::RecordBatch; @@ -229,4 +238,96 @@ mod tests { Utf8, vec![Some("hello"), Some("world"), None, Some(""), Some("")] ); + + #[test] + fn test_single_binary() -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new("a", Binary, false)])); + let values: Vec> = + vec![Some(b"one"), Some(b"two"), None, Some(b""), Some(b"three")]; + let a = BinaryArray::from_opt_vec(values); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(a)])?; + let mut vector = vec![0; 8192]; + let row_offsets = + { write_batch_unchecked(&mut vector, 0, &batch, 0, schema.clone()) }; + let output_batch = { read_as_batch(&mut vector, schema, row_offsets)? }; + assert_eq!(batch, output_batch); + Ok(()) + } + + #[tokio::test] + async fn test_with_parquet() -> Result<()> { + let runtime = Arc::new(RuntimeEnv::default()); + let projection = Some(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + let exec = get_exec("alltypes_plain.parquet", &projection, None).await?; + let schema = exec.schema().clone(); + + let batches = collect(exec, runtime).await?; + assert_eq!(1, batches.len()); + let batch = &batches[0]; + + let mut vector = vec![0; 20480]; + let row_offsets = + { write_batch_unchecked(&mut vector, 0, batch, 0, schema.clone()) }; + let output_batch = { read_as_batch(&mut vector, schema, row_offsets)? }; + assert_eq!(*batch, output_batch); + + Ok(()) + } + + #[test] + #[should_panic(expected = "supported(schema)")] + fn test_unsupported_type_write() { + let a: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); + let schema = batch.schema(); + let mut vector = vec![0; 1024]; + write_batch_unchecked(&mut vector, 0, &batch, 0, schema); + } + + #[test] + #[should_panic(expected = "supported(schema)")] + fn test_unsupported_type_read() { + let schema = Arc::new(Schema::new(vec![Field::new( + "a", + DataType::Decimal(5, 2), + false, + )])); + let mut vector = vec![0; 1024]; + let row_offsets = vec![0]; + read_as_batch(&mut vector, schema, row_offsets).unwrap(); + } + + async fn get_exec( + file_name: &str, + projection: &Option>, + limit: Option, + ) -> Result> { + let testdata = crate::test_util::parquet_test_data(); + let filename = format!("{}/{}", testdata, file_name); + let format = ParquetFormat::default(); + let file_schema = format + .infer_schema(local_object_reader_stream(vec![filename.clone()])) + .await + .expect("Schema inference"); + let statistics = format + .infer_stats(local_object_reader(filename.clone())) + .await + .expect("Stats inference"); + let file_groups = vec![vec![local_unpartitioned_file(filename.clone())]]; + let exec = format + .create_physical_plan( + FileScanConfig { + object_store: Arc::new(LocalFileSystem {}), + file_schema, + file_groups, + statistics, + projection: projection.clone(), + limit, + table_partition_cols: vec![], + }, + &[], + ) + .await?; + Ok(exec) + } } From 1bcdc99763aef432715f7adff1b0dbe7738687b5 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Wed, 9 Feb 2022 17:17:13 +0800 Subject: [PATCH 7/8] Resolve part of the comments --- datafusion/src/row/bitmap.rs | 194 +++++++++++++++++++++++++++++++ datafusion/src/row/bitmap/fmt.rs | 132 --------------------- datafusion/src/row/bitmap/mod.rs | 126 -------------------- datafusion/src/row/mod.rs | 5 +- datafusion/src/row/reader.rs | 27 +++-- datafusion/src/row/writer.rs | 46 ++++---- 6 files changed, 243 insertions(+), 287 deletions(-) create mode 100644 datafusion/src/row/bitmap.rs delete mode 100644 datafusion/src/row/bitmap/fmt.rs delete mode 100644 datafusion/src/row/bitmap/mod.rs diff --git a/datafusion/src/row/bitmap.rs b/datafusion/src/row/bitmap.rs new file mode 100644 index 000000000000..b6cbd83d0850 --- /dev/null +++ b/datafusion/src/row/bitmap.rs @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! General utilities for null bit section handling based on [arrow::util::bit_util] + +use arrow::util::bit_util::{ + ceil, get_bit_raw, round_upto_power_of_2, set_bit_raw, unset_bit_raw, +}; +use std::fmt::Write; + +const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; +const ALL_VALID_MASK: [u8; 8] = [1, 3, 7, 15, 31, 63, 127, 255]; + +/// Returns whether bit at position `i` in `byte` is set or not +#[inline] +pub fn is_set(byte: u8, i: usize) -> bool { + (byte & BIT_MASK[i]) != 0 +} + +/// Sets bit at position `i` in `data` +#[inline] +pub fn set_bit(data: &mut [u8], i: usize) { + unsafe { + set_bit_raw(data.as_mut_ptr(), i); + } +} + +/// Unsets bit at position `i` in `data` +#[inline] +pub fn unset_bit(data: &mut [u8], i: usize) { + unsafe { + unset_bit_raw(data.as_mut_ptr(), i); + } +} + +/// Returns whether bit at position `i` in `data` is set or not. +#[inline] +pub fn get_bit(data: &[u8], i: usize) -> bool { + unsafe { get_bit_raw(data.as_ptr(), i) } +} + +/// Returns the number of bytes required to hold `n` bits. +#[inline] +pub fn null_width(n: usize) -> usize { + ceil(n, 8) +} + +#[inline] +pub fn align_word(n: usize) -> usize { + round_upto_power_of_2(n, 8) +} + +/// Returns if all fields are valid +pub fn all_valid(data: &[u8], n: usize) -> bool { + for item in data.iter().take(n / 8) { + if *item != ALL_VALID_MASK[7] { + return false; + } + } + if n % 8 == 0 { + true + } else { + data[n / 8] == ALL_VALID_MASK[n % 8 - 1] + } +} + +/// Show null bit for each field in a tuple, 1 for valid and 0 for null. +/// For a tuple with nine total fields, valid at field 0, 6, 7, 8 shows as `[10000011, 1]`. +pub struct NullBitsFormatter<'a> { + null_bits: &'a [u8], + field_count: usize, +} + +impl<'a> NullBitsFormatter<'a> { + /// new + pub fn new(null_bits: &'a [u8], field_count: usize) -> Self { + Self { + null_bits, + field_count, + } + } +} + +impl<'a> std::fmt::Debug for NullBitsFormatter<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut is_first = true; + for i in 0..self.field_count { + if is_first { + f.write_char('[')?; + is_first = false; + } else if i % 8 == 0 { + f.write_str(", ")?; + } + if get_bit(self.null_bits, i) { + f.write_char('1')?; + } else { + f.write_char('0')?; + } + } + f.write_char(']')?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::Rng; + + fn test_validity(bs: &[bool]) { + let mut data = vec![0; null_width(bs.len())]; + for (i, b) in bs.iter().enumerate() { + if *b { + set_bit(&mut data, i); + } else { + unset_bit(&mut data, i); + } + } + let expected = bs.iter().all(|f| *f); + assert_eq!(all_valid(&data, bs.len()), expected); + } + + #[test] + fn test_all_valid() { + let sizes = [4, 8, 12, 16, 19, 23, 32, 44]; + for i in sizes { + { + // contains false + let input = { + let mut rng = rand::thread_rng(); + let mut input: Vec = vec![false; i]; + rng.fill(&mut input[..]); + input + }; + test_validity(&input); + } + + { + // all true + let input = vec![true; i]; + test_validity(&input); + } + } + } + + #[test] + fn test_formatter() -> std::fmt::Result { + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001], 8)), + "[10000011]" + ); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1], 9)), + "[10000011, 1]" + ); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 2)), "[10]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 3)), "[100]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 4)), "[1000]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 5)), "[10000]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 6)), "[100000]"); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[1], 7)), + "[1000000]" + ); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[1], 8)), + "[10000000]" + ); + // extra bytes are ignored + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1, 1, 1], 9)), + "[10000011, 1]" + ); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1, 1], 16)), + "[10000011, 10000000]" + ); + Ok(()) + } +} diff --git a/datafusion/src/row/bitmap/fmt.rs b/datafusion/src/row/bitmap/fmt.rs deleted file mode 100644 index 0dbc81ba1234..000000000000 --- a/datafusion/src/row/bitmap/fmt.rs +++ /dev/null @@ -1,132 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::fmt::Write; - -use super::is_set; - -/// Formats `bytes` taking into account an offset and length of the form -pub fn fmt( - bytes: &[u8], - offset: usize, - length: usize, - f: &mut std::fmt::Formatter<'_>, -) -> std::fmt::Result { - assert!(offset < 8); - - f.write_char('[')?; - let mut remaining = length; - if remaining == 0 { - f.write_char(']')?; - return Ok(()); - } - - let first = bytes[0]; - let bytes = &bytes[1..]; - let empty_before = 8usize.saturating_sub(remaining + offset); - f.write_str("0b")?; - for _ in 0..empty_before { - f.write_char('_')?; - } - let until = std::cmp::min(8, offset + remaining); - for i in offset..until { - if is_set(first, offset + until - 1 - i) { - f.write_char('1')?; - } else { - f.write_char('0')?; - } - } - for _ in 0..offset { - f.write_char('_')?; - } - remaining -= until - offset; - - if remaining == 0 { - f.write_char(']')?; - return Ok(()); - } - - let number_of_bytes = remaining / 8; - for byte in &bytes[..number_of_bytes] { - f.write_str(", ")?; - f.write_fmt(format_args!("{:#010b}", byte))?; - } - remaining -= number_of_bytes * 8; - if remaining == 0 { - f.write_char(']')?; - return Ok(()); - } - - let last = bytes[std::cmp::min((length + offset + 7) / 8, bytes.len() - 1)]; - let remaining = (length + offset) % 8; - f.write_str(", ")?; - f.write_str("0b")?; - for _ in 0..(8 - remaining) { - f.write_char('_')?; - } - for i in 0..remaining { - if is_set(last, remaining - 1 - i) { - f.write_char('1')?; - } else { - f.write_char('0')?; - } - } - f.write_char(']') -} - -#[cfg(test)] -mod tests { - use super::*; - - struct A<'a>(&'a [u8], usize, usize); - impl<'a> std::fmt::Debug for A<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - fmt(self.0, self.1, self.2, f) - } - } - - #[test] - fn test_debug() -> std::fmt::Result { - assert_eq!(format!("{:?}", A(&[1], 0, 0)), "[]"); - assert_eq!(format!("{:?}", A(&[0b11000001], 0, 8)), "[0b11000001]"); - assert_eq!( - format!("{:?}", A(&[0b11000001, 1], 0, 9)), - "[0b11000001, 0b_______1]" - ); - assert_eq!(format!("{:?}", A(&[1], 0, 2)), "[0b______01]"); - assert_eq!(format!("{:?}", A(&[1], 1, 2)), "[0b_____00_]"); - assert_eq!(format!("{:?}", A(&[1], 2, 2)), "[0b____00__]"); - assert_eq!(format!("{:?}", A(&[1], 3, 2)), "[0b___00___]"); - assert_eq!(format!("{:?}", A(&[1], 4, 2)), "[0b__00____]"); - assert_eq!(format!("{:?}", A(&[1], 5, 2)), "[0b_00_____]"); - assert_eq!(format!("{:?}", A(&[1], 6, 2)), "[0b00______]"); - assert_eq!( - format!("{:?}", A(&[0b11000001, 1], 1, 9)), - "[0b1100000_, 0b______01]" - ); - // extra bytes are ignored - assert_eq!( - format!("{:?}", A(&[0b11000001, 1, 1, 1], 1, 9)), - "[0b1100000_, 0b______01]" - ); - assert_eq!( - format!("{:?}", A(&[0b11000001, 1, 1], 2, 16)), - "[0b110000__, 0b00000001, 0b______01]" - ); - Ok(()) - } -} diff --git a/datafusion/src/row/bitmap/mod.rs b/datafusion/src/row/bitmap/mod.rs deleted file mode 100644 index 79a21f334bab..000000000000 --- a/datafusion/src/row/bitmap/mod.rs +++ /dev/null @@ -1,126 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! General utilities for null bit section handling -//! -//! Note: this is a tailored version based on [arrow2 bitmap utils](https://github.com/jorgecarleitao/arrow2/tree/main/src/bitmap/utils) - -mod fmt; - -pub use fmt::fmt; - -const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; -const UNSET_BIT_MASK: [u8; 8] = [ - 255 - 1, - 255 - 2, - 255 - 4, - 255 - 8, - 255 - 16, - 255 - 32, - 255 - 64, - 255 - 128, -]; -const ALL_VALID_MASK: [u8; 8] = [1, 3, 7, 15, 31, 63, 127, 255]; - -/// Returns whether bit at position `i` in `byte` is set or not -#[inline] -pub fn is_set(byte: u8, i: usize) -> bool { - (byte & BIT_MASK[i]) != 0 -} - -/// Sets bit at position `i` in `byte` -#[inline] -pub fn set(byte: u8, i: usize, value: bool) -> u8 { - if value { - byte | BIT_MASK[i] - } else { - byte & UNSET_BIT_MASK[i] - } -} - -/// Sets bit at position `i` in `data` -#[inline] -pub fn set_bit(data: &mut [u8], i: usize, value: bool) { - data[i / 8] = set(data[i / 8], i % 8, value); -} - -/// Returns whether bit at position `i` in `data` is set or not. -/// -/// # Safety -/// `i >= data.len() * 8` results in undefined behavior -#[inline] -pub unsafe fn get_bit_unchecked(data: &[u8], i: usize) -> bool { - (*data.as_ptr().add(i >> 3) & BIT_MASK[i & 7]) != 0 -} - -/// Returns the number of bytes required to hold `bits` bits. -#[inline] -pub fn bytes_for(bits: usize) -> usize { - bits.saturating_add(7) / 8 -} - -/// Returns if all fields are valid -pub fn all_valid(data: &[u8], n: usize) -> bool { - for item in data.iter().take(n / 8) { - if *item != ALL_VALID_MASK[7] { - return false; - } - } - if n % 8 == 0 { - true - } else { - data[n / 8] == ALL_VALID_MASK[n % 8 - 1] - } -} - -#[cfg(test)] -mod tests { - use super::*; - use rand::Rng; - - fn test_validity(bs: &[bool]) { - let mut data = vec![0; bytes_for(bs.len())]; - for (i, b) in bs.iter().enumerate() { - set_bit(&mut data, i, *b); - } - let expected = bs.iter().all(|f| *f); - assert_eq!(all_valid(&data, bs.len()), expected); - } - - #[test] - fn test_all_valid() { - let sizes = [4, 8, 12, 16, 19, 23, 32, 44]; - for i in sizes { - { - // contains false - let input = { - let mut rng = rand::thread_rng(); - let mut input: Vec = vec![false; i]; - rng.fill(&mut input[..]); - input - }; - test_validity(&input); - } - - { - // all true - let input = vec![true; i]; - test_validity(&input); - } - } - } -} diff --git a/datafusion/src/row/mod.rs b/datafusion/src/row/mod.rs index a48fb46f0432..5caad7829521 100644 --- a/datafusion/src/row/mod.rs +++ b/datafusion/src/row/mod.rs @@ -30,6 +30,7 @@ //! we append their actual content to the end of the var length region and //! store their offset relative to row base and their length, packed into an 8-byte word. +use crate::row::bitmap::align_word; use arrow::datatypes::{DataType, Schema}; use std::sync::Arc; @@ -81,7 +82,7 @@ fn var_length(dt: &DataType) -> bool { fn type_width(dt: &DataType) -> usize { use DataType::*; if var_length(dt) { - return 8; + return std::mem::size_of::(); } match dt { Boolean | UInt8 | Int8 => 1, @@ -102,7 +103,7 @@ fn estimate_row_width(null_width: usize, schema: &Arc) -> usize { _ => {} } } - (width.saturating_add(7) / 8) * 8 + align_word(width) } fn fixed_size(schema: &Arc) -> bool { diff --git a/datafusion/src/row/reader.rs b/datafusion/src/row/reader.rs index 800db82532a3..614084046fa1 100644 --- a/datafusion/src/row/reader.rs +++ b/datafusion/src/row/reader.rs @@ -18,7 +18,7 @@ //! Accessing row from raw bytes use crate::error::{DataFusionError, Result}; -use crate::row::bitmap::{all_valid, bytes_for, get_bit_unchecked}; +use crate::row::bitmap::{all_valid, get_bit, null_width, NullBitsFormatter}; use crate::row::{get_offsets, supported}; use arrow::array::{make_builder, ArrayBuilder}; use arrow::datatypes::{DataType, Schema}; @@ -82,26 +82,39 @@ macro_rules! fn_get_idx_opt { }; } -struct RowReader<'a> { +/// Read the tuple `data[base_offset..]` we are currently pointing to +pub struct RowReader<'a> { + /// Raw bytes slice where the tuple stores data: &'a [u8], + /// Start position for the current tuple in the raw bytes slice. base_offset: usize, + /// Total number of fields for each tuple. field_count: usize, + /// The number of bytes used to store null bits for each field. null_width: usize, + /// Starting offset for each fields in the raw bytes. + /// For fixed length fields, it's where the actual data stores. + /// For variable length fields, it's a pack of (offset << 32 | length) if we use u64. field_offsets: Vec, } impl<'a> std::fmt::Debug for RowReader<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let null_bits = self.null_bits(); - super::bitmap::fmt(null_bits, 0, self.null_width, f) + write!( + f, + "{:?}", + NullBitsFormatter::new(null_bits, self.field_count) + ) } } impl<'a> RowReader<'a> { - fn new(schema: &Arc, data: &'a [u8]) -> Self { + /// new + pub fn new(schema: &Arc, data: &'a [u8]) -> Self { assert!(supported(schema)); let field_count = schema.fields().len(); - let null_width = bytes_for(field_count); + let null_width = null_width(field_count); let (field_offsets, _) = get_offsets(null_width, schema); Self { data, @@ -113,7 +126,7 @@ impl<'a> RowReader<'a> { } /// Update this row to point to position `offset` in `base` - fn point_to(&mut self, offset: usize) { + pub fn point_to(&mut self, offset: usize) { self.base_offset = offset; } @@ -135,7 +148,7 @@ impl<'a> RowReader<'a> { } fn is_valid_at(&self, idx: usize) -> bool { - unsafe { get_bit_unchecked(self.null_bits(), idx) } + get_bit(self.null_bits(), idx) } fn get_bool(&self, idx: usize) -> bool { diff --git a/datafusion/src/row/writer.rs b/datafusion/src/row/writer.rs index 2aef6911c37c..e0421b057daa 100644 --- a/datafusion/src/row/writer.rs +++ b/datafusion/src/row/writer.rs @@ -17,7 +17,7 @@ //! Reusable row writer backed by Vec to stitch attributes together -use crate::row::bitmap::{bytes_for, set_bit}; +use crate::row::bitmap::{align_word, null_width, set_bit, unset_bit}; use crate::row::{estimate_row_width, fixed_size, get_offsets, supported}; use arrow::array::Array; use arrow::datatypes::{DataType, Schema}; @@ -41,7 +41,7 @@ pub fn write_batch_unchecked( let mut offsets = vec![]; for cur_row in row_idx..batch.num_rows() { offsets.push(current_offset); - let row_width = write_row(&mut writer, cur_row, batch, &schema); + let row_width = write_row(&mut writer, cur_row, batch); output[current_offset..current_offset + row_width] .copy_from_slice(writer.get_row()); current_offset += row_width; @@ -72,13 +72,23 @@ macro_rules! fn_set_idx { /// Reusable row writer backed by Vec pub struct RowWriter { + /// buffer for the current tuple been written. data: Vec, + /// Total number of fields for each tuple. field_count: usize, + /// Length in bytes for the current tuple, 8-bytes word aligned. row_width: usize, + /// The number of bytes used to store null bits for each field. null_width: usize, + /// Length in bytes for `values` part of the current tuple. values_width: usize, + /// Length in bytes for `variable length data` part of the current tuple. varlena_width: usize, + /// Current offset for the next variable length field to write to. varlena_offset: usize, + /// Starting offset for each fields in the raw bytes. + /// For fixed length fields, it's where the actual data stores. + /// For variable length fields, it's a pack of (offset << 32 | length) if we use u64. field_offsets: Vec, } @@ -87,7 +97,7 @@ impl RowWriter { pub fn new(schema: &Arc) -> Self { assert!(supported(schema)); let field_count = schema.fields().len(); - let null_width = bytes_for(field_count); + let null_width = null_width(field_count); let (field_offsets, values_width) = get_offsets(null_width, schema); let mut init_capacity = estimate_row_width(null_width, schema); if !fixed_size(schema) { @@ -121,12 +131,12 @@ impl RowWriter { fn set_null_at(&mut self, idx: usize) { let null_bits = &mut self.data[0..self.null_width]; - set_bit(null_bits, idx, false) + unset_bit(null_bits, idx) } fn set_non_null_at(&mut self, idx: usize) { let null_bits = &mut self.data[0..self.null_width]; - set_bit(null_bits, idx, true) + set_bit(null_bits, idx) } fn set_bool(&mut self, idx: usize, value: bool) { @@ -164,8 +174,8 @@ impl RowWriter { set_idx!(8, self, idx, value) } - fn set_offset_size(&mut self, idx: usize, size: usize) { - let offset_and_size: u64 = (self.varlena_offset << 32 | size) as u64; + fn set_offset_size(&mut self, idx: usize, size: u32) { + let offset_and_size: u64 = (self.varlena_offset as u64) << 32 | (size as u64); self.set_u64(idx, offset_and_size); } @@ -173,7 +183,7 @@ impl RowWriter { self.assert_index_valid(idx); let bytes = value.as_bytes(); let size = bytes.len(); - self.set_offset_size(idx, size); + self.set_offset_size(idx, size as u32); let varlena_offset = self.varlena_offset; self.data[varlena_offset..varlena_offset + size].copy_from_slice(bytes); self.varlena_offset += size; @@ -183,7 +193,7 @@ impl RowWriter { fn set_binary(&mut self, idx: usize, value: &[u8]) { self.assert_index_valid(idx); let size = value.len(); - self.set_offset_size(idx, size); + self.set_offset_size(idx, size as u32); let varlena_offset = self.varlena_offset; self.data[varlena_offset..varlena_offset + size].copy_from_slice(value); self.varlena_offset += size; @@ -197,7 +207,7 @@ impl RowWriter { /// End each row at 8-byte word boundary. fn end_padding(&mut self) { let payload_width = self.current_width(); - self.row_width = (payload_width.saturating_add(7) / 8) * 8; + self.row_width = align_word(payload_width); if self.data.capacity() < self.row_width { self.data.resize(self.row_width, 0); } @@ -209,14 +219,10 @@ impl RowWriter { } /// Stitch attributes of tuple in `batch` at `row_idx` and returns the tuple width -fn write_row( - row: &mut RowWriter, - row_idx: usize, - batch: &RecordBatch, - schema: &Arc, -) -> usize { +fn write_row(row: &mut RowWriter, row_idx: usize, batch: &RecordBatch) -> usize { // Get the row from the batch denoted by row_idx - for ((i, f), col) in schema + for ((i, f), col) in batch + .schema() .fields() .iter() .enumerate() @@ -299,13 +305,13 @@ fn write_field( } Utf8 => { let c = col.as_any().downcast_ref::().unwrap(); - let str = c.value(row_idx); - let new_width = row.current_width() + str.as_bytes().len(); + let s = c.value(row_idx); + let new_width = row.current_width() + s.as_bytes().len(); if new_width > row.data.capacity() { // double the capacity to avoid repeated resize row.data.resize(max(row.data.capacity() * 2, new_width), 0); } - row.set_utf8(col_idx, str); + row.set_utf8(col_idx, s); } Binary => { let c = col.as_any().downcast_ref::().unwrap(); From 1a2f08eda5d25b5784d17217f922cd9fbe48c341 Mon Sep 17 00:00:00 2001 From: Yijie Shen Date: Fri, 11 Feb 2022 01:00:37 +0800 Subject: [PATCH 8/8] Resolve comments --- Cargo.toml | 4 - datafusion/Cargo.toml | 2 + datafusion/src/lib.rs | 2 +- datafusion/src/row/bitmap.rs | 194 ---------------------------- datafusion/src/row/mod.rs | 158 ++++++++++++++++++++++- datafusion/src/row/reader.rs | 242 ++++++++++++++--------------------- datafusion/src/row/writer.rs | 14 +- 7 files changed, 265 insertions(+), 351 deletions(-) delete mode 100644 datafusion/src/row/bitmap.rs diff --git a/Cargo.toml b/Cargo.toml index 126fbf7db682..ea1acc04e687 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,3 @@ members = [ [profile.release] lto = true codegen-units = 1 - -[patch.crates-io] -arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" } -parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "731e132489b99cd688f884642cf20de52aed24d0" } diff --git a/datafusion/Cargo.toml b/datafusion/Cargo.toml index 54247cbcf07c..21e36121fe5a 100644 --- a/datafusion/Cargo.toml +++ b/datafusion/Cargo.toml @@ -48,6 +48,8 @@ pyarrow = ["pyo3", "arrow/pyarrow"] force_hash_collisions = [] # Used to enable the avro format avro = ["avro-rs", "num-traits"] +# Used to enable row format experiment +row = [] [dependencies] ahash = { version = "0.7", default-features = false } diff --git a/datafusion/src/lib.rs b/datafusion/src/lib.rs index a0f8279f709c..17b590fb4e96 100644 --- a/datafusion/src/lib.rs +++ b/datafusion/src/lib.rs @@ -223,7 +223,7 @@ pub use arrow; pub use parquet; pub(crate) mod field_util; -#[allow(dead_code)] +#[cfg(feature = "row")] pub(crate) mod row; #[cfg(feature = "pyarrow")] diff --git a/datafusion/src/row/bitmap.rs b/datafusion/src/row/bitmap.rs deleted file mode 100644 index b6cbd83d0850..000000000000 --- a/datafusion/src/row/bitmap.rs +++ /dev/null @@ -1,194 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! General utilities for null bit section handling based on [arrow::util::bit_util] - -use arrow::util::bit_util::{ - ceil, get_bit_raw, round_upto_power_of_2, set_bit_raw, unset_bit_raw, -}; -use std::fmt::Write; - -const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128]; -const ALL_VALID_MASK: [u8; 8] = [1, 3, 7, 15, 31, 63, 127, 255]; - -/// Returns whether bit at position `i` in `byte` is set or not -#[inline] -pub fn is_set(byte: u8, i: usize) -> bool { - (byte & BIT_MASK[i]) != 0 -} - -/// Sets bit at position `i` in `data` -#[inline] -pub fn set_bit(data: &mut [u8], i: usize) { - unsafe { - set_bit_raw(data.as_mut_ptr(), i); - } -} - -/// Unsets bit at position `i` in `data` -#[inline] -pub fn unset_bit(data: &mut [u8], i: usize) { - unsafe { - unset_bit_raw(data.as_mut_ptr(), i); - } -} - -/// Returns whether bit at position `i` in `data` is set or not. -#[inline] -pub fn get_bit(data: &[u8], i: usize) -> bool { - unsafe { get_bit_raw(data.as_ptr(), i) } -} - -/// Returns the number of bytes required to hold `n` bits. -#[inline] -pub fn null_width(n: usize) -> usize { - ceil(n, 8) -} - -#[inline] -pub fn align_word(n: usize) -> usize { - round_upto_power_of_2(n, 8) -} - -/// Returns if all fields are valid -pub fn all_valid(data: &[u8], n: usize) -> bool { - for item in data.iter().take(n / 8) { - if *item != ALL_VALID_MASK[7] { - return false; - } - } - if n % 8 == 0 { - true - } else { - data[n / 8] == ALL_VALID_MASK[n % 8 - 1] - } -} - -/// Show null bit for each field in a tuple, 1 for valid and 0 for null. -/// For a tuple with nine total fields, valid at field 0, 6, 7, 8 shows as `[10000011, 1]`. -pub struct NullBitsFormatter<'a> { - null_bits: &'a [u8], - field_count: usize, -} - -impl<'a> NullBitsFormatter<'a> { - /// new - pub fn new(null_bits: &'a [u8], field_count: usize) -> Self { - Self { - null_bits, - field_count, - } - } -} - -impl<'a> std::fmt::Debug for NullBitsFormatter<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let mut is_first = true; - for i in 0..self.field_count { - if is_first { - f.write_char('[')?; - is_first = false; - } else if i % 8 == 0 { - f.write_str(", ")?; - } - if get_bit(self.null_bits, i) { - f.write_char('1')?; - } else { - f.write_char('0')?; - } - } - f.write_char(']')?; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use rand::Rng; - - fn test_validity(bs: &[bool]) { - let mut data = vec![0; null_width(bs.len())]; - for (i, b) in bs.iter().enumerate() { - if *b { - set_bit(&mut data, i); - } else { - unset_bit(&mut data, i); - } - } - let expected = bs.iter().all(|f| *f); - assert_eq!(all_valid(&data, bs.len()), expected); - } - - #[test] - fn test_all_valid() { - let sizes = [4, 8, 12, 16, 19, 23, 32, 44]; - for i in sizes { - { - // contains false - let input = { - let mut rng = rand::thread_rng(); - let mut input: Vec = vec![false; i]; - rng.fill(&mut input[..]); - input - }; - test_validity(&input); - } - - { - // all true - let input = vec![true; i]; - test_validity(&input); - } - } - } - - #[test] - fn test_formatter() -> std::fmt::Result { - assert_eq!( - format!("{:?}", NullBitsFormatter::new(&[0b11000001], 8)), - "[10000011]" - ); - assert_eq!( - format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1], 9)), - "[10000011, 1]" - ); - assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 2)), "[10]"); - assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 3)), "[100]"); - assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 4)), "[1000]"); - assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 5)), "[10000]"); - assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 6)), "[100000]"); - assert_eq!( - format!("{:?}", NullBitsFormatter::new(&[1], 7)), - "[1000000]" - ); - assert_eq!( - format!("{:?}", NullBitsFormatter::new(&[1], 8)), - "[10000000]" - ); - // extra bytes are ignored - assert_eq!( - format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1, 1, 1], 9)), - "[10000011, 1]" - ); - assert_eq!( - format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1, 1], 16)), - "[10000011, 10000000]" - ); - Ok(()) - } -} diff --git a/datafusion/src/row/mod.rs b/datafusion/src/row/mod.rs index 5caad7829521..9875b84975e2 100644 --- a/datafusion/src/row/mod.rs +++ b/datafusion/src/row/mod.rs @@ -29,18 +29,90 @@ //! - For fields of non-primitive or variable-length types, //! we append their actual content to the end of the var length region and //! store their offset relative to row base and their length, packed into an 8-byte word. +//! +//! ┌────────────────┬──────────────────────────┬───────────────────────┐ ┌───────────────────────┬────────────┐ +//! │Validity Bitmask│ Fixed Width Field │ Variable Width Field │ ... │ vardata area │ padding │ +//! │ (byte aligned) │ (native type width) │(vardata offset + len) │ │ (variable length) │ bytes │ +//! └────────────────┴──────────────────────────┴───────────────────────┘ └───────────────────────┴────────────┘ +//! +//! For example, given the schema (Int8, Utf8, Float32, Utf8) +//! +//! Encoding the tuple (1, "FooBar", NULL, "baz") +//! +//! Requires 32 bytes (31 bytes payload and 1 byte padding to make each tuple 8-bytes aligned): +//! +//! ┌──────────┬──────────┬──────────────────────┬──────────────┬──────────────────────┬───────────────────────┬──────────┐ +//! │0b00001011│ 0x01 │0x00000016 0x00000006│ 0x00000000 │0x0000001C 0x00000003│ FooBarbaz │ 0x00 │ +//! └──────────┴──────────┴──────────────────────┴──────────────┴──────────────────────┴───────────────────────┴──────────┘ +//! 0 1 2 10 14 22 31 32 +//! -use crate::row::bitmap::align_word; use arrow::datatypes::{DataType, Schema}; +use arrow::util::bit_util::{get_bit_raw, round_upto_power_of_2}; +use std::fmt::Write; use std::sync::Arc; -mod bitmap; mod reader; mod writer; +const ALL_VALID_MASK: [u8; 8] = [1, 3, 7, 15, 31, 63, 127, 255]; + const UTF8_DEFAULT_SIZE: usize = 20; const BINARY_DEFAULT_SIZE: usize = 100; +/// Returns if all fields are valid +pub fn all_valid(data: &[u8], n: usize) -> bool { + for item in data.iter().take(n / 8) { + if *item != ALL_VALID_MASK[7] { + return false; + } + } + if n % 8 == 0 { + true + } else { + data[n / 8] == ALL_VALID_MASK[n % 8 - 1] + } +} + +/// Show null bit for each field in a tuple, 1 for valid and 0 for null. +/// For a tuple with nine total fields, valid at field 0, 6, 7, 8 shows as `[10000011, 1]`. +pub struct NullBitsFormatter<'a> { + null_bits: &'a [u8], + field_count: usize, +} + +impl<'a> NullBitsFormatter<'a> { + /// new + pub fn new(null_bits: &'a [u8], field_count: usize) -> Self { + Self { + null_bits, + field_count, + } + } +} + +impl<'a> std::fmt::Debug for NullBitsFormatter<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut is_first = true; + let data = self.null_bits; + for i in 0..self.field_count { + if is_first { + f.write_char('[')?; + is_first = false; + } else if i % 8 == 0 { + f.write_str(", ")?; + } + if unsafe { get_bit_raw(data.as_ptr(), i) } { + f.write_char('1')?; + } else { + f.write_char('0')?; + } + } + f.write_char(']')?; + Ok(()) + } +} + /// Get relative offsets for each field and total width for values fn get_offsets(null_width: usize, schema: &Arc) -> (Vec, usize) { let mut offsets = vec![]; @@ -103,7 +175,7 @@ fn estimate_row_width(null_width: usize, schema: &Arc) -> usize { _ => {} } } - align_word(width) + round_upto_power_of_2(width, 8) } fn fixed_size(schema: &Arc) -> bool { @@ -133,9 +205,89 @@ mod tests { use crate::row::reader::read_as_batch; use crate::row::writer::write_batch_unchecked; use arrow::record_batch::RecordBatch; + use arrow::util::bit_util::{ceil, set_bit_raw, unset_bit_raw}; use arrow::{array::*, datatypes::*}; + use rand::Rng; use DataType::*; + fn test_validity(bs: &[bool]) { + let n = bs.len(); + let mut data = vec![0; ceil(n, 8)]; + for (i, b) in bs.iter().enumerate() { + if *b { + let data_argument = &mut data; + unsafe { + set_bit_raw(data_argument.as_mut_ptr(), i); + }; + } else { + let data_argument = &mut data; + unsafe { + unset_bit_raw(data_argument.as_mut_ptr(), i); + }; + } + } + let expected = bs.iter().all(|f| *f); + assert_eq!(all_valid(&data, bs.len()), expected); + } + + #[test] + fn test_all_valid() { + let sizes = [4, 8, 12, 16, 19, 23, 32, 44]; + for i in sizes { + { + // contains false + let input = { + let mut rng = rand::thread_rng(); + let mut input: Vec = vec![false; i]; + rng.fill(&mut input[..]); + input + }; + test_validity(&input); + } + + { + // all true + let input = vec![true; i]; + test_validity(&input); + } + } + } + + #[test] + fn test_formatter() -> std::fmt::Result { + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001], 8)), + "[10000011]" + ); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1], 9)), + "[10000011, 1]" + ); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 2)), "[10]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 3)), "[100]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 4)), "[1000]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 5)), "[10000]"); + assert_eq!(format!("{:?}", NullBitsFormatter::new(&[1], 6)), "[100000]"); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[1], 7)), + "[1000000]" + ); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[1], 8)), + "[10000000]" + ); + // extra bytes are ignored + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1, 1, 1], 9)), + "[10000011, 1]" + ); + assert_eq!( + format!("{:?}", NullBitsFormatter::new(&[0b11000001, 1, 1], 16)), + "[10000011, 10000000]" + ); + Ok(()) + } + macro_rules! fn_test_single_type { ($ARRAY: ident, $TYPE: expr, $VEC: expr) => { paste::item! { diff --git a/datafusion/src/row/reader.rs b/datafusion/src/row/reader.rs index 614084046fa1..779c09990ffc 100644 --- a/datafusion/src/row/reader.rs +++ b/datafusion/src/row/reader.rs @@ -18,12 +18,12 @@ //! Accessing row from raw bytes use crate::error::{DataFusionError, Result}; -use crate::row::bitmap::{all_valid, get_bit, null_width, NullBitsFormatter}; -use crate::row::{get_offsets, supported}; -use arrow::array::{make_builder, ArrayBuilder}; +use crate::row::{all_valid, get_offsets, supported, NullBitsFormatter}; +use arrow::array::*; use arrow::datatypes::{DataType, Schema}; use arrow::error::Result as ArrowResult; use arrow::record_batch::RecordBatch; +use arrow::util::bit_util::{ceil, get_bit_raw}; use std::sync::Arc; /// Read `data` of raw-bytes rows starting at `offsets` out to a record batch @@ -38,7 +38,7 @@ pub fn read_as_batch( for offset in offsets.iter().take(row_num) { row.point_to(*offset); - read_row(&row, &mut output, &schema)? + read_row(&row, &mut output, &schema); } output.output().map_err(DataFusionError::ArrowError) @@ -114,7 +114,7 @@ impl<'a> RowReader<'a> { pub fn new(schema: &Arc, data: &'a [u8]) -> Self { assert!(supported(schema)); let field_count = schema.fields().len(); - let null_width = null_width(field_count); + let null_width = ceil(field_count, 8); let (field_offsets, _) = get_offsets(null_width, schema); Self { data, @@ -148,7 +148,7 @@ impl<'a> RowReader<'a> { } fn is_valid_at(&self, idx: usize) -> bool { - get_bit(self.null_bits(), idx) + unsafe { get_bit_raw(self.null_bits().as_ptr(), idx) } } fn get_bool(&self, idx: usize) -> bool { @@ -238,11 +238,7 @@ impl<'a> RowReader<'a> { } } -fn read_row( - row: &RowReader, - batch: &mut MutableRecordBatch, - schema: &Arc, -) -> Result<()> { +fn read_row(row: &RowReader, batch: &mut MutableRecordBatch, schema: &Arc) { if row.all_valid() { for ((col_idx, to), field) in batch .arrays @@ -250,7 +246,7 @@ fn read_row( .enumerate() .zip(schema.fields().iter()) { - read_field_null_free(to, field.data_type(), col_idx, row)? + read_field_null_free(to, field.data_type(), col_idx, row) } } else { for ((col_idx, to), field) in batch @@ -259,10 +255,66 @@ fn read_row( .enumerate() .zip(schema.fields().iter()) { - read_field(to, field.data_type(), col_idx, row)? + read_field(to, field.data_type(), col_idx, row) } } - Ok(()) +} + +macro_rules! fn_read_field { + ($NATIVE: ident, $ARRAY: ident) => { + paste::item! { + fn [](to: &mut Box, col_idx: usize, row: &RowReader) { + let to = to + .as_any_mut() + .downcast_mut::<$ARRAY>() + .unwrap(); + to.append_option(row.[](col_idx)) + .map_err(DataFusionError::ArrowError) + .unwrap(); + } + + fn [](to: &mut Box, col_idx: usize, row: &RowReader) { + let to = to + .as_any_mut() + .downcast_mut::<$ARRAY>() + .unwrap(); + to.append_value(row.[](col_idx)) + .map_err(DataFusionError::ArrowError) + .unwrap(); + } + } + }; +} + +fn_read_field!(bool, BooleanBuilder); +fn_read_field!(u8, UInt8Builder); +fn_read_field!(u16, UInt16Builder); +fn_read_field!(u32, UInt32Builder); +fn_read_field!(u64, UInt64Builder); +fn_read_field!(i8, Int8Builder); +fn_read_field!(i16, Int16Builder); +fn_read_field!(i32, Int32Builder); +fn_read_field!(i64, Int64Builder); +fn_read_field!(f32, Float32Builder); +fn_read_field!(f64, Float64Builder); +fn_read_field!(date32, Date32Builder); +fn_read_field!(date64, Date64Builder); +fn_read_field!(utf8, StringBuilder); + +fn read_field_binary(to: &mut Box, col_idx: usize, row: &RowReader) { + let to = to.as_any_mut().downcast_mut::().unwrap(); + if row.is_valid_at(col_idx) { + to.append_value(row.get_binary(col_idx)).unwrap(); + } else { + to.append_null().unwrap(); + } +} + +fn read_field_binary_nf(to: &mut Box, col_idx: usize, row: &RowReader) { + let to = to.as_any_mut().downcast_mut::().unwrap(); + to.append_value(row.get_binary(col_idx)) + .map_err(DataFusionError::ArrowError) + .unwrap(); } fn read_field( @@ -270,77 +322,26 @@ fn read_field( dt: &DataType, col_idx: usize, row: &RowReader, -) -> Result<()> { - use arrow::array::*; +) { use DataType::*; match dt { - Boolean => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_bool_opt(col_idx))?; - } - UInt8 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_u8_opt(col_idx))?; - } - UInt16 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_u16_opt(col_idx))?; - } - UInt32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_u32_opt(col_idx))?; - } - UInt64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_u64_opt(col_idx))?; - } - Int8 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_i8_opt(col_idx))?; - } - Int16 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_i16_opt(col_idx))?; - } - Int32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_i32_opt(col_idx))?; - } - Int64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_i64_opt(col_idx))?; - } - Float32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_f32_opt(col_idx))?; - } - Float64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_f64_opt(col_idx))?; - } - Date32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_date32_opt(col_idx))?; - } - Date64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_date64_opt(col_idx))?; - } - Utf8 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_option(row.get_utf8_opt(col_idx))?; - } - Binary => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - if row.is_valid_at(col_idx) { - to.append_value(row.get_binary(col_idx))?; - } else { - to.append_null()?; - } - } + Boolean => read_field_bool(to, col_idx, row), + UInt8 => read_field_u8(to, col_idx, row), + UInt16 => read_field_u16(to, col_idx, row), + UInt32 => read_field_u32(to, col_idx, row), + UInt64 => read_field_u64(to, col_idx, row), + Int8 => read_field_i8(to, col_idx, row), + Int16 => read_field_i16(to, col_idx, row), + Int32 => read_field_i32(to, col_idx, row), + Int64 => read_field_i64(to, col_idx, row), + Float32 => read_field_f32(to, col_idx, row), + Float64 => read_field_f64(to, col_idx, row), + Date32 => read_field_date32(to, col_idx, row), + Date64 => read_field_date64(to, col_idx, row), + Utf8 => read_field_utf8(to, col_idx, row), + Binary => read_field_binary(to, col_idx, row), _ => unimplemented!(), } - Ok(()) } fn read_field_null_free( @@ -348,73 +349,26 @@ fn read_field_null_free( dt: &DataType, col_idx: usize, row: &RowReader, -) -> Result<()> { - use arrow::array::*; +) { use DataType::*; match dt { - Boolean => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_bool(col_idx))?; - } - UInt8 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_u8(col_idx))?; - } - UInt16 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_u16(col_idx))?; - } - UInt32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_u32(col_idx))?; - } - UInt64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_u64(col_idx))?; - } - Int8 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_i8(col_idx))?; - } - Int16 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_i16(col_idx))?; - } - Int32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_i32(col_idx))?; - } - Int64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_i64(col_idx))?; - } - Float32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_f32(col_idx))?; - } - Float64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_f64(col_idx))?; - } - Date32 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_date32(col_idx))?; - } - Date64 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_date64(col_idx))?; - } - Utf8 => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_utf8(col_idx))?; - } - Binary => { - let to = to.as_any_mut().downcast_mut::().unwrap(); - to.append_value(row.get_binary(col_idx))?; - } + Boolean => read_field_bool_nf(to, col_idx, row), + UInt8 => read_field_u8_nf(to, col_idx, row), + UInt16 => read_field_u16_nf(to, col_idx, row), + UInt32 => read_field_u32_nf(to, col_idx, row), + UInt64 => read_field_u64_nf(to, col_idx, row), + Int8 => read_field_i8_nf(to, col_idx, row), + Int16 => read_field_i16_nf(to, col_idx, row), + Int32 => read_field_i32_nf(to, col_idx, row), + Int64 => read_field_i64_nf(to, col_idx, row), + Float32 => read_field_f32_nf(to, col_idx, row), + Float64 => read_field_f64_nf(to, col_idx, row), + Date32 => read_field_date32_nf(to, col_idx, row), + Date64 => read_field_date64_nf(to, col_idx, row), + Utf8 => read_field_utf8_nf(to, col_idx, row), + Binary => read_field_binary_nf(to, col_idx, row), _ => unimplemented!(), } - Ok(()) } struct MutableRecordBatch { diff --git a/datafusion/src/row/writer.rs b/datafusion/src/row/writer.rs index e0421b057daa..698f7974c10a 100644 --- a/datafusion/src/row/writer.rs +++ b/datafusion/src/row/writer.rs @@ -17,11 +17,11 @@ //! Reusable row writer backed by Vec to stitch attributes together -use crate::row::bitmap::{align_word, null_width, set_bit, unset_bit}; use crate::row::{estimate_row_width, fixed_size, get_offsets, supported}; use arrow::array::Array; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; +use arrow::util::bit_util::{ceil, round_upto_power_of_2, set_bit_raw, unset_bit_raw}; use std::cmp::max; use std::sync::Arc; @@ -97,7 +97,7 @@ impl RowWriter { pub fn new(schema: &Arc) -> Self { assert!(supported(schema)); let field_count = schema.fields().len(); - let null_width = null_width(field_count); + let null_width = ceil(field_count, 8); let (field_offsets, values_width) = get_offsets(null_width, schema); let mut init_capacity = estimate_row_width(null_width, schema); if !fixed_size(schema) { @@ -131,12 +131,16 @@ impl RowWriter { fn set_null_at(&mut self, idx: usize) { let null_bits = &mut self.data[0..self.null_width]; - unset_bit(null_bits, idx) + unsafe { + unset_bit_raw(null_bits.as_mut_ptr(), idx); + } } fn set_non_null_at(&mut self, idx: usize) { let null_bits = &mut self.data[0..self.null_width]; - set_bit(null_bits, idx) + unsafe { + set_bit_raw(null_bits.as_mut_ptr(), idx); + } } fn set_bool(&mut self, idx: usize, value: bool) { @@ -207,7 +211,7 @@ impl RowWriter { /// End each row at 8-byte word boundary. fn end_padding(&mut self) { let payload_width = self.current_width(); - self.row_width = align_word(payload_width); + self.row_width = round_upto_power_of_2(payload_width, 8); if self.data.capacity() < self.row_width { self.data.resize(self.row_width, 0); }