Skip to content

Commit

Permalink
ArrayData Enumeration for Primitive, Binary and UTF8 (#3749)
Browse files Browse the repository at this point in the history
* Add BooleanBuffer

* Add NullBuffer

* Add PrimitiveArrayData

* Add BytesArrayData

* Move module

* Make private for now

* Move NullBuffer to arrow-buffer

* Format

* More docs

* Seal traits

* Doc

* Review feedback
  • Loading branch information
tustvold authored Feb 27, 2023
1 parent 96791ea commit dae7a71
Show file tree
Hide file tree
Showing 7 changed files with 812 additions and 2 deletions.
84 changes: 84 additions & 0 deletions arrow-buffer/src/buffer/boolean.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::{bit_util, Buffer};

/// A slice-able [`Buffer`] containing bit-packed booleans
///
/// The logical contents are the `len` bits that start `offset` bits into `buffer`
#[derive(Debug, Clone)]
pub struct BooleanBuffer {
/// Underlying bytes holding the bit-packed values
buffer: Buffer,
/// Position of the first logical bit within `buffer`, measured in bits
offset: usize,
/// Number of logical bits, counted from `offset`
len: usize,
}

impl BooleanBuffer {
    /// Wraps `buffer` as a [`BooleanBuffer`] spanning `len` bits that begin at bit `offset`
    ///
    /// # Panics
    ///
    /// Panics if `offset + len` exceeds the number of bits held by `buffer`
    pub fn new(buffer: Buffer, offset: usize, len: usize) -> Self {
        // Saturating arithmetic stops either side of the bounds check from wrapping around
        let bit_len = buffer.len().saturating_mul(8);
        let total_len = offset.saturating_add(len);
        assert!(total_len <= bit_len);
        Self {
            buffer,
            offset,
            len,
        }
    }

    /// Counts the set bits among the `len` bits starting at `offset`
    pub fn count_set_bits(&self) -> usize {
        let Self { buffer, offset, len } = self;
        buffer.count_set_bits_offset(*offset, *len)
    }

    /// Reports whether the bit at logical index `i` is set
    ///
    /// # Panics
    ///
    /// Panics if `i >= self.len()`
    #[inline]
    pub fn is_set(&self, i: usize) -> bool {
        assert!(i < self.len);
        // Translate the logical index into a bit position within the underlying buffer
        let bit_idx = i + self.offset;
        // SAFETY: `i < self.len` was asserted above and `new` verified that
        // `offset + len` does not exceed the bit length of `buffer`, so `bit_idx`
        // refers to a bit inside `buffer`
        unsafe { bit_util::get_bit_raw(self.buffer.as_ptr(), bit_idx) }
    }

    /// Returns the bit offset at which this [`BooleanBuffer`] starts within its buffer
    #[inline]
    pub fn offset(&self) -> usize {
        self.offset
    }

    /// Returns the number of bits in this [`BooleanBuffer`]
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` if this [`BooleanBuffer`] contains no bits
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the underlying bit-packed bytes
    ///
    /// The bit `offset` is not applied: the slice covers the whole underlying buffer
    #[inline]
    pub fn values(&self) -> &[u8] {
        &*self.buffer
    }
}
7 changes: 5 additions & 2 deletions arrow-buffer/src/buffer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ pub use immutable::*;
mod mutable;
pub use mutable::*;
mod ops;
pub use ops::*;
mod scalar;
pub use scalar::*;

pub use ops::*;
mod boolean;
pub use boolean::*;
mod null;
pub use null::*;
90 changes: 90 additions & 0 deletions arrow-buffer/src/buffer/null.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::buffer::BooleanBuffer;

/// A [`BooleanBuffer`] interpreted as a validity mask, stored together with its null count
///
/// A set bit marks a valid (non-null) value and an unset bit marks a null
#[derive(Debug, Clone)]
pub struct NullBuffer {
/// Validity bits: set means valid, unset means null
buffer: BooleanBuffer,
/// Number of nulls (unset bits) in `buffer`; computed by `new`, and taken
/// on trust from the caller by `new_unchecked`
null_count: usize,
}

impl NullBuffer {
/// Create a new [`NullBuffer`] computing the null count
pub fn new(buffer: BooleanBuffer) -> Self {
let null_count = buffer.len() - buffer.count_set_bits();
Self { buffer, null_count }
}

/// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
///
/// # Safety
///
/// `buffer` must contain `null_count` `0` bits
pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
Self { buffer, null_count }
}

/// Returns the length of this [`NullBuffer`]
#[inline]
pub fn len(&self) -> usize {
self.buffer.len()
}

/// Returns true if this [`NullBuffer`] is empty
#[inline]
pub fn is_empty(&self) -> bool {
self.buffer.is_empty()
}

/// Returns the null count for this [`NullBuffer`]
#[inline]
pub fn null_count(&self) -> usize {
self.null_count
}

/// Returns `true` if the value at `idx` is not null
#[inline]
pub fn is_valid(&self, idx: usize) -> bool {
self.buffer.is_set(idx)
}

/// Returns `true` if the value at `idx` is null
#[inline]
pub fn is_null(&self, idx: usize) -> bool {
!self.is_valid(idx)
}

/// Returns the inner buffer
#[inline]
pub fn inner(&self) -> &BooleanBuffer {
&self.buffer
}
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_size() {
        // Wrapping a `NullBuffer` in an `Option` should cost no extra space:
        // the niche optimisation is expected to absorb the discriminant
        let bare = std::mem::size_of::<NullBuffer>();
        let wrapped = std::mem::size_of::<Option<NullBuffer>>();
        assert_eq!(bare, wrapped);
    }
}
Loading

0 comments on commit dae7a71

Please sign in to comment.