Skip to content

Commit

Permalink
feat: new implementation for String/Binary type. (pola-rs#13748)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored and r-brink committed Jan 24, 2024
1 parent ad75f91 commit fa1bfa1
Show file tree
Hide file tree
Showing 145 changed files with 2,691 additions and 2,864 deletions.
76 changes: 67 additions & 9 deletions crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,22 @@ mod private {
impl Sealed for str {}
impl Sealed for [u8] {}
}
pub use iterator::BinaryViewValueIter;
pub use mutable::MutableBinaryViewArray;
use private::Sealed;

use crate::array::binview::iterator::BinaryViewValueIter;
use crate::array::binview::view::{
validate_binary_view, validate_utf8_only_view, validate_utf8_view,
};
use crate::array::binview::view::{validate_binary_view, validate_utf8_only, validate_utf8_view};
use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::utils::{BitmapIter, ZipValidity};

/// [`BinaryViewArrayGeneric`] specialised to `[u8]` values.
pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
/// [`BinaryViewArrayGeneric`] specialised to `str` values.
pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;

/// [`MutableBinaryViewArray`] specialised to `str` values.
pub type MutablePlString = MutableBinaryViewArray<str>;
/// [`MutableBinaryViewArray`] specialised to `[u8]` values.
pub type MutablePlBinary = MutableBinaryViewArray<[u8]>;

// Kept as statics so that the `ViewType::dtype` implementations can return
// `&'static ArrowDataType` references to them.
static BIN_VIEW_TYPE: ArrowDataType = ArrowDataType::BinaryView;
static UTF8_VIEW_TYPE: ArrowDataType = ArrowDataType::Utf8View;

pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
const IS_UTF8: bool;
const DATA_TYPE: ArrowDataType;
Expand All @@ -49,6 +52,8 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {

#[allow(clippy::wrong_self_convention)]
fn into_owned(&self) -> Self::Owned;

fn dtype() -> &'static ArrowDataType;
}

impl ViewType for str {
Expand All @@ -69,6 +74,9 @@ impl ViewType for str {
// Copies the borrowed `str` into a freshly allocated owned string.
fn into_owned(&self) -> Self::Owned {
self.to_string()
}
// Returns a reference to the shared `Utf8View` data type static.
fn dtype() -> &'static ArrowDataType {
&UTF8_VIEW_TYPE
}
}

impl ViewType for [u8] {
Expand All @@ -89,6 +97,10 @@ impl ViewType for [u8] {
// Copies the borrowed bytes into a freshly allocated owned vector.
fn into_owned(&self) -> Self::Owned {
self.to_vec()
}

// Returns a reference to the shared `BinaryView` data type static.
fn dtype() -> &'static ArrowDataType {
&BIN_VIEW_TYPE
}
}

pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
Expand All @@ -105,6 +117,12 @@ pub struct BinaryViewArrayGeneric<T: ViewType + ?Sized> {
total_buffer_len: usize,
}

impl<T: ViewType + ?Sized> PartialEq for BinaryViewArrayGeneric<T> {
    /// Two arrays are equal when they have the same number of elements and every
    /// pair of elements yielded by their iterators compares equal.
    fn eq(&self, other: &Self) -> bool {
        // `zip` stops as soon as the shorter side is exhausted, so without the
        // length check an array would compare equal to any longer array that it
        // is a prefix of (and an empty array would equal everything).
        self.len() == other.len() && self.into_iter().zip(other).all(|(l, r)| l == r)
    }
}

impl<T: ViewType + ?Sized> Clone for BinaryViewArrayGeneric<T> {
fn clone(&self) -> Self {
Self {
Expand Down Expand Up @@ -262,7 +280,7 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
// data: 12 bytes

let bytes = if len <= 12 {
let ptr = self.views.storage_ptr() as *const u8;
let ptr = self.views.as_ptr() as *const u8;
std::slice::from_raw_parts(ptr.add(i * 16 + 4), len as usize)
} else {
let buffer_idx = (v >> 64) as u32;
Expand All @@ -285,6 +303,10 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
BinaryViewValueIter::new(self)
}

/// Returns an iterator yielding, for each view, the view truncated to its low
/// 32 bits (the per-value length, going by this method's name).
pub fn len_iter(&self) -> impl Iterator<Item = u32> + '_ {
    self.views.iter().map(|&view| view as u32)
}

/// Returns an iterator of the non-null values.
pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryViewArrayGeneric<T>> {
NonNullValuesIter::new(self, self.validity())
Expand All @@ -299,13 +321,19 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
impl_mut_validity!();
impl_into_array!();

pub fn from<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {
/// Builds an array from a slice of optional values.
///
/// Each `Option<S>` is borrowed as `Option<&S>` and fed to
/// [`MutableBinaryViewArray::from_iterator`]; the resulting mutable array is
/// then converted into `Self`.
pub fn from_slice<S: AsRef<T>, P: AsRef<[Option<S>]>>(slice: P) -> Self {
    let entries = slice.as_ref().iter().map(Option::as_ref);
    MutableBinaryViewArray::from_iterator(entries).into()
}

/// Builds an array from a slice of values, none of which are optional.
///
/// Each element is borrowed through `AsRef` and fed to
/// [`MutableBinaryViewArray::from_values_iter`]; the resulting mutable array is
/// then converted into `Self`.
pub fn from_slice_values<S: AsRef<T>, P: AsRef<[S]>>(slice: P) -> Self {
    let values = slice.as_ref().iter().map(|value| value.as_ref());
    MutableBinaryViewArray::from_values_iter(values).into()
}

/// Get the total length of bytes that it would take to concatenate all binary/str values in this array.
pub fn total_bytes_len(&self) -> usize {
self.total_bytes_len
Expand All @@ -320,12 +348,40 @@ impl<T: ViewType + ?Sized> BinaryViewArrayGeneric<T> {
/// Returns the number of elements in this array, i.e. the number of views.
pub fn len(&self) -> usize {
self.views.len()
}

/// Garbage collect.
///
/// Returns `self` untouched when the array has no data buffers. Otherwise every
/// view is pushed, together with this array's raw buffers, into a fresh
/// [`MutableBinaryViewArray`]; the frozen result is returned with this array's
/// validity attached.
pub fn gc(self) -> Self {
// Nothing to collect when no data buffers are held.
if self.buffers.is_empty() {
return self;
}
let mut mutable = MutableBinaryViewArray::with_capacity(self.len());
let buffers = self.raw_buffers.as_ref();

for view in self.views.as_ref() {
// SAFETY (assumed — TODO confirm against `push_view`'s contract): every view
// originates from this array and `buffers` are this array's own raw buffers.
unsafe { mutable.push_view(*view, buffers) }
}
mutable.freeze().with_validity(self.validity)
}

/// Runs [`Self::gc`] only when the data buffers appear to be mostly unused.
///
/// The array is returned as-is when its total buffer length is zero. Otherwise
/// a lower bound on the bytes that must live in the buffers is estimated by
/// assuming every element keeps up to 12 bytes inline; if that bound is below
/// 25% of the total buffer length (plus one), the array is garbage collected.
pub fn maybe_gc(self) -> Self {
    if self.total_buffer_len == 0 {
        return self;
    }
    // Subtract the maximum amount of inlined strings.
    let buffered_lower_bound = self.total_bytes_len.saturating_sub(self.len() * 12);
    let denominator = (self.total_buffer_len() + 1) as f64;
    let used_fraction = (buffered_lower_bound as f64) / denominator;

    if used_fraction < 0.25 {
        self.gc()
    } else {
        self
    }
}
}

impl BinaryViewArray {
/// Validate the underlying bytes on UTF-8.
///
/// # Errors
/// Returns whatever error `validate_utf8_only` reports for this array's views
/// and buffers.
pub fn validate_utf8(&self) -> PolarsResult<()> {
    // Only the post-change call is kept: `validate_utf8_only` is the name the
    // module imports from `view`.
    validate_utf8_only(&self.views, &self.buffers)
}

/// Convert [`BinaryViewArray`] to [`Utf8ViewArray`].
Expand Down Expand Up @@ -381,7 +437,7 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
}

fn data_type(&self) -> &ArrowDataType {
    // The data type is fully determined by `T`, so it is taken from the
    // per-type static exposed by `ViewType::dtype` instead of a field.
    T::dtype()
}

fn validity(&self) -> Option<&Bitmap> {
Expand All @@ -397,12 +453,14 @@ impl<T: ViewType + ?Sized> Array for BinaryViewArrayGeneric<T> {
}

/// Slices this array in place to the `length` elements starting at `offset`.
///
/// # Safety
/// The caller must ensure that `offset + length <= self.len()`; this is only
/// verified by a `debug_assert!`.
unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
debug_assert!(offset + length <= self.len());
// Slice the validity bitmap as well, and drop it altogether when the sliced
// window contains no unset bits.
self.validity = self
.validity
.take()
.map(|bitmap| bitmap.sliced_unchecked(offset, length))
.filter(|bitmap| bitmap.unset_bits() > 0);
self.views.slice_unchecked(offset, length);
// Recompute the cached byte total from the views that remain after slicing.
self.total_bytes_len = self.len_iter().map(|v| v as usize).sum::<usize>();
}

fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Expand Down
Loading

0 comments on commit fa1bfa1

Please sign in to comment.