pola-rs · ritchie46 · Jun 1, 2023 · Jun 1, 2023
@@ -2,7 +2,6 @@ use std::convert::TryFrom;
 
 use arrow::array::*;
 use arrow::bitmap::{Bitmap, MutableBitmap};
-use arrow::offset::OffsetsBuffer;
 use polars_arrow::array::list::AnonymousBuilder;
 use polars_arrow::array::PolarsArray;
 use polars_arrow::bit_util::unset_bit_raw;
@@ -415,193 +414,6 @@ pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec<IdxSiz
     idx
 }
 
-impl ChunkExplode for ListChunked {
-    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
-        // A list array's memory layout is actually already 'exploded', so we can just take the values array
-        // of the list. And we also return a slice of the offsets. This slice can be used to find the old
-        // list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation
-        let ca = self.rechunk();
-        let listarr: &LargeListArray = ca
-            .downcast_iter()
-            .next()
-            .ok_or_else(|| polars_err!(NoData: "cannot explode empty list"))?;
-        let offsets_buf = listarr.offsets().clone();
-        let offsets = listarr.offsets().as_slice();
-        let mut values = listarr.values().clone();
-
-        let mut s = if ca._can_fast_explode() {
-            // ensure that the value array is sliced
-            // as a list only slices its offsets on a slice operation
-
-            // we only do this in fast-explode as for the other
-            // branch the offsets must coincide with the values.
-            if !offsets.is_empty() {
-                let start = offsets[0] as usize;
-                let len = offsets[offsets.len() - 1] as usize - start;
-                // safety:
-                // we are in bounds
-                values = unsafe { values.sliced_unchecked(start, len) };
-            }
-            // safety: inner_dtype should be correct
-            unsafe {
-                Series::from_chunks_and_dtype_unchecked(
-                    self.name(),
-                    vec![values],
-                    &self.inner_dtype().to_physical(),
-                )
-            }
-        } else {
-            // during tests
-            // test that this code branch is not hit with list arrays that could be fast exploded
-            #[cfg(test)]
-            {
-                let mut last = offsets[0];
-                let mut has_empty = false;
-                for &o in &offsets[1..] {
-                    if o == last {
-                        has_empty = true;
-                    }
-                    last = o;
-                }
-                if !has_empty && offsets[0] == 0 {
-                    panic!("could have fast exploded")
-                }
-            }
-
-            // safety: inner_dtype should be correct
-            let values = unsafe {
-                Series::from_chunks_and_dtype_unchecked(
-                    self.name(),
-                    vec![values],
-                    &self.inner_dtype().to_physical(),
-                )
-            };
-            values.explode_by_offsets(offsets)
-        };
-        debug_assert_eq!(s.name(), self.name());
-        // restore logical type
-        unsafe {
-            s = s.cast_unchecked(&self.inner_dtype()).unwrap();
-        }
-
-        Ok((s, offsets_buf))
-    }
-}
-
-impl ChunkExplode for Utf8Chunked {
-    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
-        // A list array's memory layout is actually already 'exploded', so we can just take the values array
-        // of the list. And we also return a slice of the offsets. This slice can be used to find the old
-        // list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation
-        let ca = self.rechunk();
-        let array: &Utf8Array<i64> = ca
-            .downcast_iter()
-            .next()
-            .ok_or_else(|| polars_err!(NoData: "cannot explode empty str"))?;
-
-        let values = array.values();
-        let old_offsets = array.offsets().clone();
-
-        let (new_offsets, validity) = if let Some(validity) = array.validity() {
-            // capacity estimate
-            let capacity = self.get_values_size() + validity.unset_bits();
-
-            let old_offsets = old_offsets.as_slice();
-            let mut old_offset = old_offsets[0];
-            let mut new_offsets = Vec::with_capacity(capacity + 1);
-            new_offsets.push(old_offset);
-
-            let mut bitmap = MutableBitmap::with_capacity(capacity);
-            let values = values.as_slice();
-            for (&offset, valid) in old_offsets[1..].iter().zip(validity) {
-                // safety:
-                // new_offsets already has a single value, so -1 is always in bounds
-                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };
-
-                if valid {
-                    debug_assert!(old_offset as usize <= values.len());
-                    debug_assert!(offset as usize <= values.len());
-                    let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };
-
-                    // take the string value and find the char offsets
-                    // create a new offset value for each char boundary
-                    // safety:
-                    // we know we have string data.
-                    let str_val = unsafe { std::str::from_utf8_unchecked(val) };
-
-                    let char_offsets = str_val
-                        .char_indices()
-                        .skip(1)
-                        .map(|t| t.0 as i64 + latest_offset);
-
-                    // extend the chars
-                    // also keep track of the amount of offsets added
-                    // as we must update the validity bitmap
-                    let len_before = new_offsets.len();
-                    new_offsets.extend(char_offsets);
-                    new_offsets.push(latest_offset + str_val.len() as i64);
-                    bitmap.extend_constant(new_offsets.len() - len_before, true);
-                } else {
-                    // no data, just add old offset and set null bit
-                    new_offsets.push(latest_offset);
-                    bitmap.push(false)
-                }
-                old_offset = offset;
-            }
-
-            (new_offsets.into(), bitmap.into())
-        } else {
-            // fast(er) explode
-
-            // we cannot naively explode, because there might be empty strings.
-
-            // capacity estimate
-            let capacity = self.get_values_size();
-            let old_offsets = old_offsets.as_slice();
-            let mut old_offset = old_offsets[0];
-            let mut new_offsets = Vec::with_capacity(capacity + 1);
-            new_offsets.push(old_offset);
-
-            let values = values.as_slice();
-            for &offset in &old_offsets[1..] {
-                // safety:
-                // new_offsets already has a single value, so -1 is always in bounds
-                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };
-                debug_assert!(old_offset as usize <= values.len());
-                debug_assert!(offset as usize <= values.len());
-                let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };
-
-                // take the string value and find the char offsets
-                // create a new offset value for each char boundary
-                // safety:
-                // we know we have string data.
-                let str_val = unsafe { std::str::from_utf8_unchecked(val) };
-
-                let char_offsets = str_val
-                    .char_indices()
-                    .skip(1)
-                    .map(|t| t.0 as i64 + latest_offset);
-
-                // extend the chars
-                new_offsets.extend(char_offsets);
-                new_offsets.push(latest_offset + str_val.len() as i64);
-                old_offset = offset;
-            }
-
-            (new_offsets.into(), None)
-        };
-
-        let array = unsafe {
-            Utf8Array::<i64>::from_data_unchecked_default(new_offsets, values.clone(), validity)
-        };
-
-        let new_arr = Box::new(array) as ArrayRef;
-
-        let s = Series::try_from((self.name(), new_arr)).unwrap();
-        Ok((s, old_offsets))
-    }
-}
-
 #[cfg(test)]
 mod test {
     use super::*;