Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(rust, python): implement explode for DataType::Array #9157

Merged
merged 1 commit into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 0 additions & 188 deletions polars/polars-core/src/chunked_array/ops/explode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use std::convert::TryFrom;

use arrow::array::*;
use arrow::bitmap::{Bitmap, MutableBitmap};
use arrow::offset::OffsetsBuffer;
use polars_arrow::array::list::AnonymousBuilder;
use polars_arrow::array::PolarsArray;
use polars_arrow::bit_util::unset_bit_raw;
Expand Down Expand Up @@ -415,193 +414,6 @@ pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec<IdxSiz
idx
}

impl ChunkExplode for ListChunked {
    /// Flattens the list column into a `Series` of its inner values.
    ///
    /// The values array of a list array is already laid out in "exploded" form,
    /// so the work here is mostly about picking the right window of that array.
    /// Alongside the flattened values, a clone of the list offsets is returned so
    /// callers can recover the original list layout (or expand other columns
    /// in the same way).
    ///
    /// # Errors
    /// Returns a `NoData` error when the rechunked array yields no chunk.
    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
        let rechunked = self.rechunk();
        let list_array: &LargeListArray = rechunked
            .downcast_iter()
            .next()
            .ok_or_else(|| polars_err!(NoData: "cannot explode empty list"))?;
        let original_offsets = list_array.offsets().clone();
        let offsets = list_array.offsets().as_slice();
        let mut inner = list_array.values().clone();

        let fast_path = rechunked._can_fast_explode();
        if fast_path {
            // A slice operation on a list only slices its offsets, so the values
            // array has to be narrowed to the window the offsets actually cover.
            // This is done only on the fast path: the other path needs the
            // offsets to keep coinciding with the unsliced values.
            if let (Some(&first), Some(&last)) = (offsets.first(), offsets.last()) {
                let start = first as usize;
                let len = last as usize - start;
                // SAFETY: the offsets index into the values array, so the window
                // `start..start + len` is in bounds.
                inner = unsafe { inner.sliced_unchecked(start, len) };
            }
        } else {
            // During tests, verify that this slower path is never taken for
            // arrays that could have been fast exploded.
            #[cfg(test)]
            {
                let has_empty = offsets.windows(2).any(|pair| pair[0] == pair[1]);
                if !has_empty && offsets[0] == 0 {
                    panic!("could have fast exploded")
                }
            }
        }

        let physical_dtype = self.inner_dtype().to_physical();
        // SAFETY: the inner dtype should match the values array.
        let physical = unsafe {
            Series::from_chunks_and_dtype_unchecked(self.name(), vec![inner], &physical_dtype)
        };

        let exploded = if fast_path {
            physical
        } else {
            physical.explode_by_offsets(offsets)
        };
        debug_assert_eq!(exploded.name(), self.name());

        // Restore the logical type on top of the physical values.
        let logical = unsafe { exploded.cast_unchecked(&self.inner_dtype()).unwrap() };

        Ok((logical, original_offsets))
    }
}

impl ChunkExplode for Utf8Chunked {
    /// Explodes every string into one row per `char`.
    ///
    /// The resulting array shares the original values buffer; only a new offsets
    /// buffer (with one boundary per char) and, when the input has a validity
    /// bitmap, a new validity bitmap are built. An empty string yields a single
    /// empty-string row and a null yields a single null row.
    ///
    /// Returns the exploded `Series` together with a clone of the *original*
    /// offsets of the (rechunked) string array.
    ///
    /// # Errors
    /// Returns a `NoData` error when the rechunked array yields no chunk.
    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
        let ca = self.rechunk();
        let array: &Utf8Array<i64> = ca
            .downcast_iter()
            .next()
            .ok_or_else(|| polars_err!(NoData: "cannot explode empty str"))?;

        let values = array.values();
        let old_offsets = array.offsets().clone();

        let (new_offsets, validity) = if let Some(validity) = array.validity() {
            // capacity estimate
            let capacity = self.get_values_size() + validity.unset_bits();

            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let mut bitmap = MutableBitmap::with_capacity(capacity);
            let values = values.as_slice();
            for (&offset, valid) in old_offsets[1..].iter().zip(validity) {
                if valid {
                    debug_assert!(old_offset as usize <= values.len());
                    debug_assert!(offset as usize <= values.len());
                    // SAFETY: the offsets of the array index into its values buffer.
                    let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                    // SAFETY: we know we have string data.
                    let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                    // Create a new offset for each char boundary. The offsets are
                    // computed from the absolute position in the values buffer
                    // (`old_offset`), not from the last offset written, because the
                    // output array reuses the original values buffer.
                    let char_offsets = str_val
                        .char_indices()
                        .skip(1)
                        .map(|(idx, _)| old_offset + idx as i64);

                    // extend the chars
                    // also keep track of the amount of offsets added
                    // as we must update the validity bitmap
                    let len_before = new_offsets.len();
                    new_offsets.extend(char_offsets);
                    new_offsets.push(offset);
                    bitmap.extend_constant(new_offsets.len() - len_before, true);
                } else {
                    // A null slot may still span bytes in the values buffer. Advance
                    // to `offset` so the following strings keep pointing at their
                    // own bytes; the spanned bytes are masked by the null bit.
                    new_offsets.push(offset);
                    bitmap.push(false)
                }
                old_offset = offset;
            }

            (new_offsets.into(), bitmap.into())
        } else {
            // fast(er) explode

            // we cannot naively explode, because there might be empty strings.

            // capacity estimate
            let capacity = self.get_values_size();
            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let values = values.as_slice();
            for &offset in &old_offsets[1..] {
                debug_assert!(old_offset as usize <= values.len());
                debug_assert!(offset as usize <= values.len());
                // SAFETY: the offsets of the array index into its values buffer.
                let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                // SAFETY: we know we have string data.
                let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                // One new offset per char boundary, expressed as an absolute
                // position in the shared values buffer.
                let char_offsets = str_val
                    .char_indices()
                    .skip(1)
                    .map(|(idx, _)| old_offset + idx as i64);

                // extend the chars
                new_offsets.extend(char_offsets);
                new_offsets.push(offset);
                old_offset = offset;
            }

            (new_offsets.into(), None)
        };

        // SAFETY: the new offsets are monotonically non-decreasing, stay within the
        // values buffer and every valid slot lies on char boundaries of string data.
        let array = unsafe {
            Utf8Array::<i64>::from_data_unchecked_default(new_offsets, values.clone(), validity)
        };

        let new_arr = Box::new(array) as ArrayRef;

        let s = Series::try_from((self.name(), new_arr)).unwrap();
        Ok((s, old_offsets))
    }
}

#[cfg(test)]
mod test {
use super::*;
Expand Down
Loading