diff --git a/crates/polars-compute/src/min_max/dyn_array.rs b/crates/polars-compute/src/min_max/dyn_array.rs index 64883aff2ae0..e988bbd0ef54 100644 --- a/crates/polars-compute/src/min_max/dyn_array.rs +++ b/crates/polars-compute/src/min_max/dyn_array.rs @@ -15,37 +15,55 @@ macro_rules! call_op { $op(arr) .map(|v| Box::new(<$scalar>::new(arr.data_type().clone(), Some(v))) as Box) }}; + ($T:ty, $scalar:ty, $arr:expr, $op:path, ret_two) => {{ + let arr: &$T = $arr.as_any().downcast_ref().unwrap(); + $op(arr).map(|(l, r)| { + ( + Box::new(<$scalar>::new(Some(l))) as Box, + Box::new(<$scalar>::new(Some(r))) as Box, + ) + }) + }}; + (dt: $T:ty, $scalar:ty, $arr:expr, $op:path, ret_two) => {{ + let arr: &$T = $arr.as_any().downcast_ref().unwrap(); + $op(arr).map(|(l, r)| { + ( + Box::new(<$scalar>::new(arr.data_type().clone(), Some(l))) as Box, + Box::new(<$scalar>::new(arr.data_type().clone(), Some(r))) as Box, + ) + }) + }}; } macro_rules! call { - ($arr:expr, $op:path) => {{ + ($arr:expr, $op:path$(, $variant:ident)?) => {{ let arr = $arr; use arrow::datatypes::{PhysicalType as PH, PrimitiveType as PR}; use PrimitiveArray as PArr; use PrimitiveScalar as PScalar; match arr.data_type().to_physical_type() { - PH::Boolean => call_op!(BooleanArray, BooleanScalar, arr, $op), - PH::Primitive(PR::Int8) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::Int16) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::Int32) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::Int64) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::Int128) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::UInt8) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::UInt16) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::UInt32) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::UInt64) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::UInt128) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::Float32) => call_op!(dt: PArr, PScalar, arr, $op), - PH::Primitive(PR::Float64) => call_op!(dt: PArr, PScalar, arr, $op), + PH::Boolean => call_op!(BooleanArray, BooleanScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::Int8) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::Int16) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::Int32) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::Int64) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::Int128) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::UInt8) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::UInt16) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::UInt32) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::UInt64) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::UInt128) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::Float32) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), + PH::Primitive(PR::Float64) => call_op!(dt: PArr, PScalar, arr, $op$(, $variant)?), - PH::BinaryView => call_op!(BinaryViewArray, BinaryViewScalar<[u8]>, arr, $op), - PH::Utf8View => call_op!(Utf8ViewArray, BinaryViewScalar, arr, $op), + PH::BinaryView => call_op!(BinaryViewArray, BinaryViewScalar<[u8]>, arr, $op$(, $variant)?), + PH::Utf8View => call_op!(Utf8ViewArray, BinaryViewScalar, arr, $op$(, $variant)?), - PH::Binary => call_op!(BinaryArray, BinaryScalar, arr, $op), - PH::LargeBinary => call_op!(BinaryArray, BinaryScalar, arr, $op), - PH::Utf8 => call_op!(Utf8Array, BinaryScalar, arr, $op), - PH::LargeUtf8 => call_op!(Utf8Array, BinaryScalar, arr, $op), + PH::Binary => call_op!(BinaryArray, BinaryScalar, arr, $op$(, $variant)?), + PH::LargeBinary => call_op!(BinaryArray, BinaryScalar, arr, $op$(, $variant)?), + PH::Utf8 => call_op!(Utf8Array, BinaryScalar, arr, $op$(, $variant)?), + PH::LargeUtf8 => call_op!(Utf8Array, BinaryScalar, arr, $op$(, $variant)?), _ => todo!("Dynamic MinMax is not yet implemented for {:?}", arr.data_type()), } @@ -67,3 +85,9 @@ pub fn dyn_array_min_propagate_nan(arr: &dyn Array) -> Option> { pub fn dyn_array_max_propagate_nan(arr: &dyn Array) -> Option> { call!(arr, MinMaxKernel::max_propagate_nan_kernel) } + +pub fn dyn_array_min_max_propagate_nan( + arr: &dyn Array, +) -> Option<(Box, Box)> { + call!(arr, MinMaxKernel::min_max_propagate_nan_kernel, ret_two) +} diff --git a/crates/polars-compute/src/min_max/mod.rs b/crates/polars-compute/src/min_max/mod.rs index 32b22f06b84f..0ffb15499449 100644 --- a/crates/polars-compute/src/min_max/mod.rs +++ b/crates/polars-compute/src/min_max/mod.rs @@ -2,7 +2,7 @@ use polars_utils::min_max::MinMax; pub use self::dyn_array::{ dyn_array_max_ignore_nan, dyn_array_max_propagate_nan, dyn_array_min_ignore_nan, - dyn_array_min_propagate_nan, + dyn_array_min_max_propagate_nan, dyn_array_min_propagate_nan, }; /// Low-level min/max kernel. diff --git a/crates/polars-compute/src/min_max/scalar.rs b/crates/polars-compute/src/min_max/scalar.rs index 5a14fc571a08..8134b53daabd 100644 --- a/crates/polars-compute/src/min_max/scalar.rs +++ b/crates/polars-compute/src/min_max/scalar.rs @@ -6,6 +6,20 @@ use polars_utils::min_max::MinMax; use super::MinMaxKernel; +fn min_max_ignore_nan((cur_min, cur_max): (T, T), (min, max): (T, T)) -> (T, T) { + ( + MinMax::min_ignore_nan(cur_min, min), + MinMax::max_ignore_nan(cur_max, max), + ) +} + +fn min_max_propagate_nan((cur_min, cur_max): (T, T), (min, max): (T, T)) -> (T, T) { + ( + MinMax::min_propagate_nan(cur_min, min), + MinMax::max_propagate_nan(cur_max, max), + ) +} + fn reduce_vals(v: &PrimitiveArray, f: F) -> Option where T: NativeType, @@ -18,6 +32,18 @@ where } } +fn reduce_tuple_vals(v: &PrimitiveArray, f: F) -> Option<(T, T)> +where + T: NativeType, + F: Fn((T, T), (T, T)) -> (T, T), +{ + if v.null_count() == 0 { + v.values_iter().copied().map(|v| (v, v)).reduce(f) + } else { + v.non_null_values_iter().map(|v| (v, v)).reduce(f) + } +} + impl MinMaxKernel for PrimitiveArray { type Scalar<'a> = T; @@ -29,6 +55,10 @@ impl MinMaxKernel for Primitiv reduce_vals(self, MinMax::max_ignore_nan) } + fn min_max_ignore_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + reduce_tuple_vals(self, min_max_ignore_nan) + } + fn min_propagate_nan_kernel(&self) -> Option> { reduce_vals(self, MinMax::min_propagate_nan) } @@ -36,6 +66,10 @@ impl MinMaxKernel for Primitiv fn max_propagate_nan_kernel(&self) -> Option> { reduce_vals(self, MinMax::max_propagate_nan) } + + fn min_max_propagate_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + reduce_tuple_vals(self, min_max_propagate_nan) + } } impl MinMaxKernel for [T] { @@ -49,6 +83,13 @@ impl MinMaxKernel for [T] { self.iter().copied().reduce(MinMax::max_ignore_nan) } + fn min_max_ignore_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + self.iter() + .copied() + .map(|v| (v, v)) + .reduce(min_max_ignore_nan) + } + fn min_propagate_nan_kernel(&self) -> Option> { self.iter().copied().reduce(MinMax::min_propagate_nan) } @@ -56,6 +97,13 @@ impl MinMaxKernel for [T] { fn max_propagate_nan_kernel(&self) -> Option> { self.iter().copied().reduce(MinMax::max_propagate_nan) } + + fn min_max_propagate_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + self.iter() + .copied() + .map(|v| (v, v)) + .reduce(min_max_propagate_nan) + } } impl MinMaxKernel for BooleanArray { diff --git a/crates/polars-compute/src/min_max/simd.rs b/crates/polars-compute/src/min_max/simd.rs index 8c25725b618b..87ce193e0b4c 100644 --- a/crates/polars-compute/src/min_max/simd.rs +++ b/crates/polars-compute/src/min_max/simd.rs @@ -72,6 +72,87 @@ where Some(state) } +fn fold_agg_min_max_kernel( + arr: &[T], + validity: Option<&Bitmap>, + min_scalar_identity: T, + max_scalar_identity: T, + mut simd_f: F, +) -> Option<(Simd, Simd)> +where + T: SimdElement + NativeType, + F: FnMut((Simd, Simd), (Simd, Simd)) -> (Simd, Simd), + LaneCount: SupportedLaneCount, +{ + if arr.is_empty() { + return None; + } + + let mut arr_chunks = arr.chunks_exact(N); + + let min_identity = Simd::splat(min_scalar_identity); + let max_identity = Simd::splat(max_scalar_identity); + let mut state = (min_identity, max_identity); + if let Some(valid) = validity { + if valid.unset_bits() == arr.len() { + return None; + } + + let mask = BitMask::from_bitmap(valid); + let mut offset = 0; + for c in arr_chunks.by_ref() { + let m: Mask<_, N> = mask.get_simd(offset); + let slice = Simd::from_slice(c); + state = simd_f( + state, + (m.select(slice, min_identity), m.select(slice, max_identity)), + ); + offset += N; + } + if arr.len() % N > 0 { + let mut min_rest: [T; N] = min_identity.to_array(); + let mut max_rest: [T; N] = max_identity.to_array(); + + let arr_rest = arr_chunks.remainder(); + min_rest[..arr_rest.len()].copy_from_slice(arr_rest); + max_rest[..arr_rest.len()].copy_from_slice(arr_rest); + + let m: Mask<_, N> = mask.get_simd(offset); + + let min_rest = Simd::from_array(min_rest); + let max_rest = Simd::from_array(max_rest); + + state = simd_f( + state, + ( + m.select(min_rest, min_identity), + m.select(max_rest, max_identity), + ), + ); + } + } else { + for c in arr_chunks.by_ref() { + let slice = Simd::from_slice(c); + state = simd_f(state, (slice, slice)); + } + if arr.len() % N > 0 { + let mut min_rest: [T; N] = min_identity.to_array(); + let mut max_rest: [T; N] = max_identity.to_array(); + + let arr_rest = arr_chunks.remainder(); + min_rest[..arr_rest.len()].copy_from_slice(arr_rest); + max_rest[..arr_rest.len()].copy_from_slice(arr_rest); + + let min_rest = Simd::from_array(min_rest); + let max_rest = Simd::from_array(max_rest); + + state = simd_f(state, (min_rest, max_rest)); + } + } + + Some(state) +} + macro_rules! impl_min_max_kernel_int { ($T:ty, $N:literal) => { impl MinMaxKernel for PrimitiveArray<$T> { @@ -91,6 +172,17 @@ macro_rules! impl_min_max_kernel_int { .map(|s| s.reduce_max()) } + fn min_max_ignore_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + fold_agg_min_max_kernel::<$N, $T, _>( + self.values(), + self.validity(), + <$T>::MAX, + <$T>::MIN, + |(cmin, cmax), (min, max)| (cmin.simd_min(min), cmax.simd_max(max)), + ) + .map(|(min, max)| (min.reduce_min(), max.reduce_max())) + } + fn min_propagate_nan_kernel(&self) -> Option> { self.min_ignore_nan_kernel() } @@ -98,6 +190,10 @@ macro_rules! impl_min_max_kernel_int { fn max_propagate_nan_kernel(&self) -> Option> { self.max_ignore_nan_kernel() } + + fn min_max_propagate_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + self.min_max_ignore_nan_kernel() + } } impl MinMaxKernel for [$T] { @@ -113,6 +209,17 @@ macro_rules! impl_min_max_kernel_int { .map(|s| s.reduce_max()) } + fn min_max_ignore_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + fold_agg_min_max_kernel::<$N, $T, _>( + self, + None, + <$T>::MAX, + <$T>::MIN, + |(cmin, cmax), (min, max)| (cmin.simd_min(min), cmax.simd_max(max)), + ) + .map(|(min, max)| (min.reduce_min(), max.reduce_max())) + } + fn min_propagate_nan_kernel(&self) -> Option> { self.min_ignore_nan_kernel() } @@ -120,6 +227,10 @@ macro_rules! impl_min_max_kernel_int { fn max_propagate_nan_kernel(&self) -> Option> { self.max_ignore_nan_kernel() } + + fn min_max_propagate_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + self.min_max_ignore_nan_kernel() + } } }; } @@ -152,6 +263,17 @@ macro_rules! impl_min_max_kernel_float { .map(|s| s.reduce_max()) } + fn min_max_ignore_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + fold_agg_min_max_kernel::<$N, $T, _>( + self.values(), + self.validity(), + <$T>::NAN, + <$T>::NAN, + |(cmin, cmax), (min, max)| (cmin.simd_min(min), cmax.simd_max(max)), + ) + .map(|(min, max)| (min.reduce_min(), max.reduce_max())) + } + fn min_propagate_nan_kernel(&self) -> Option> { fold_agg_kernel::<$N, $T, _>( self.values(), @@ -171,6 +293,27 @@ macro_rules! impl_min_max_kernel_float { ) .map(|s| scalar_reduce_max_propagate_nan(s.as_array())) } + + fn min_max_propagate_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + fold_agg_min_max_kernel::<$N, $T, _>( + self.values(), + self.validity(), + <$T>::INFINITY, + <$T>::NEG_INFINITY, + |(cmin, cmax), (min, max)| { + ( + (cmin.simd_lt(min) | cmin.simd_ne(cmin)).select(cmin, min), + (cmax.simd_gt(max) | cmax.simd_ne(cmax)).select(cmax, max), + ) + }, + ) + .map(|(min, max)| { + ( + scalar_reduce_min_propagate_nan(min.as_array()), + scalar_reduce_max_propagate_nan(max.as_array()), + ) + }) + } } impl MinMaxKernel for [$T] { @@ -186,6 +329,17 @@ macro_rules! impl_min_max_kernel_float { .map(|s| s.reduce_max()) } + fn min_max_ignore_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + fold_agg_min_max_kernel::<$N, $T, _>( + self, + None, + <$T>::NAN, + <$T>::NAN, + |(cmin, cmax), (min, max)| (cmin.simd_min(min), cmax.simd_max(max)), + ) + .map(|(min, max)| (min.reduce_min(), max.reduce_max())) + } + fn min_propagate_nan_kernel(&self) -> Option> { fold_agg_kernel::<$N, $T, _>(self, None, <$T>::INFINITY, |a, b| { (a.simd_lt(b) | a.simd_ne(a)).select(a, b) @@ -199,6 +353,27 @@ macro_rules! impl_min_max_kernel_float { }) .map(|s| scalar_reduce_max_propagate_nan(s.as_array())) } + + fn min_max_propagate_nan_kernel(&self) -> Option<(Self::Scalar<'_>, Self::Scalar<'_>)> { + fold_agg_min_max_kernel::<$N, $T, _>( + self, + None, + <$T>::INFINITY, + <$T>::NEG_INFINITY, + |(cmin, cmax), (min, max)| { + ( + (cmin.simd_lt(min) | cmin.simd_ne(cmin)).select(cmin, min), + (cmax.simd_gt(max) | cmax.simd_ne(cmax)).select(cmax, max), + ) + }, + ) + .map(|(min, max)| { + ( + scalar_reduce_min_propagate_nan(min.as_array()), + scalar_reduce_max_propagate_nan(max.as_array()), + ) + }) + } } }; } diff --git a/crates/polars-parquet/src/arrow/write/primitive/basic.rs b/crates/polars-parquet/src/arrow/write/primitive/basic.rs index 14fc2a910ab5..b914978ea8db 100644 --- a/crates/polars-parquet/src/arrow/write/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/basic.rs @@ -26,20 +26,47 @@ where T: num_traits::AsPrimitive

, { if is_optional { - buffer.reserve(std::mem::size_of::

() * (array.len() - array.null_count())); // append the non-null values - for x in array.non_null_values_iter() { - let parquet_native: P = x.as_(); - buffer.extend_from_slice(parquet_native.to_le_bytes().as_ref()) + let validity = array.validity(); + + if let Some(validity) = validity { + let null_count = validity.unset_bits(); + + if null_count > 0 { + let values = array.values().as_slice(); + let mut iter = validity.iter(); + + buffer.reserve(std::mem::size_of::

() * (array.len() - null_count)); + + let mut offset = 0; + let mut remaining_valid = array.len() - null_count; + while remaining_valid > 0 { + let num_valid = iter.take_leading_ones(); + buffer.extend( + values[offset..offset + num_valid] + .iter() + .flat_map(|value| value.as_().to_le_bytes()), + ); + remaining_valid -= num_valid; + offset += num_valid; + + let num_invalid = iter.take_leading_zeros(); + offset += num_invalid; + } + + return buffer; + } } - } else { - buffer.reserve(std::mem::size_of::

() * array.len()); - // append all values - array.values().iter().for_each(|x| { - let parquet_native: P = x.as_(); - buffer.extend_from_slice(parquet_native.to_le_bytes().as_ref()) - }); } + + buffer.reserve(std::mem::size_of::

() * array.len()); + buffer.extend( + array + .values() + .iter() + .flat_map(|value| value.as_().to_le_bytes()), + ); + buffer } @@ -168,31 +195,44 @@ where P: ParquetNativeType, T: num_traits::AsPrimitive

, { + let (min_value, max_value) = match (options.min_value, options.max_value) { + (true, true) => { + match polars_compute::min_max::dyn_array_min_max_propagate_nan(array as &dyn Array) { + None => (None, None), + Some((l, r)) => (Some(l), Some(r)), + } + }, + (true, false) => ( + polars_compute::min_max::dyn_array_min_propagate_nan(array as &dyn Array), + None, + ), + (false, true) => ( + None, + polars_compute::min_max::dyn_array_max_propagate_nan(array as &dyn Array), + ), + (false, false) => (None, None), + }; + + let min_value = min_value.and_then(|s| { + s.as_any() + .downcast_ref::>() + .unwrap() + .value() + .map(|x| x.as_()) + }); + let max_value = max_value.and_then(|s| { + s.as_any() + .downcast_ref::>() + .unwrap() + .value() + .map(|x| x.as_()) + }); + PrimitiveStatistics::

{ primitive_type, null_count: options.null_count.then_some(array.null_count() as i64), distinct_count: None, - max_value: options - .max_value - .then(|| { - let scalar = - polars_compute::min_max::dyn_array_max_propagate_nan(array as &dyn Array); - scalar.and_then(|s| { - let s = s.as_any().downcast_ref::>().unwrap(); - s.value().map(|x| x.as_()) - }) - }) - .flatten(), - min_value: options - .min_value - .then(|| { - let scalar = - polars_compute::min_max::dyn_array_min_propagate_nan(array as &dyn Array); - scalar.and_then(|s| { - let s = s.as_any().downcast_ref::>().unwrap(); - s.value().map(|x| x.as_()) - }) - }) - .flatten(), + max_value, + min_value, } } diff --git a/crates/polars-parquet/src/arrow/write/utils.rs b/crates/polars-parquet/src/arrow/write/utils.rs index f18290f4b350..bbbe177af648 100644 --- a/crates/polars-parquet/src/arrow/write/utils.rs +++ b/crates/polars-parquet/src/arrow/write/utils.rs @@ -4,7 +4,7 @@ use polars_error::*; use super::{Version, WriteOptions}; use crate::parquet::compression::CompressionOptions; -use crate::parquet::encoding::hybrid_rle::encode; +use crate::parquet::encoding::hybrid_rle::{self, encode}; use crate::parquet::encoding::Encoding; use crate::parquet::metadata::Descriptor; use crate::parquet::page::{DataPage, DataPageHeader, DataPageHeaderV1, DataPageHeaderV2}; @@ -12,34 +12,6 @@ use crate::parquet::schema::types::PrimitiveType; use crate::parquet::statistics::ParquetStatistics; use crate::parquet::CowBuffer; -fn encode_iter_v1>(buffer: &mut Vec, iter: I) -> PolarsResult<()> { - buffer.extend_from_slice(&[0; 4]); - let start = buffer.len(); - encode::(buffer, iter, 1)?; - let end = buffer.len(); - let length = end - start; - - // write the first 4 bytes as length - let length = (length as i32).to_le_bytes(); - (0..4).for_each(|i| buffer[start - 4 + i] = length[i]); - Ok(()) -} - -fn encode_iter_v2>(writer: &mut Vec, iter: I) -> PolarsResult<()> { - Ok(encode::(writer, iter, 1)?) -} - -fn encode_iter>( - writer: &mut Vec, - iter: I, - version: Version, -) -> PolarsResult<()> { - match version { - Version::V1 => encode_iter_v1(writer, iter), - Version::V2 => encode_iter_v2(writer, iter), - } -} - /// writes the def levels to a `Vec` and returns it. pub fn write_def_levels( writer: &mut Vec, @@ -48,11 +20,35 @@ pub fn write_def_levels( len: usize, version: Version, ) -> PolarsResult<()> { - // encode def levels - match (is_optional, validity) { - (true, Some(validity)) => encode_iter(writer, validity.iter(), version), - (true, None) => encode_iter(writer, std::iter::repeat(true).take(len), version), - _ => Ok(()), // is required => no def levels + if is_optional { + match version { + Version::V1 => { + writer.extend(&[0, 0, 0, 0]); + let start = writer.len(); + + match validity { + None => >::run_length_encode( + writer, len, true, 1, + )?, + Some(validity) => encode::(writer, validity.iter(), 1)?, + } + + // write the first 4 bytes as length + let length = ((writer.len() - start) as i32).to_le_bytes(); + (0..4).for_each(|i| writer[start - 4 + i] = length[i]); + }, + Version::V2 => match validity { + None => { + >::run_length_encode(writer, len, true, 1)? + }, + Some(validity) => encode::(writer, validity.iter(), 1)?, + }, + } + + Ok(()) + } else { + // is required => no def levels + Ok(()) } } diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs index e5e935ab525f..4ee48ef18e81 100644 --- a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs @@ -9,7 +9,7 @@ mod fuzz; pub use bitmap::{encode_bool as bitpacked_encode, BitmapIter}; pub use buffered::BufferedBitpacked; -pub use encoder::encode; +pub use encoder::{encode, Encoder}; pub use gatherer::{ DictionaryTranslator, FnTranslator, Translator, TryFromUsizeTranslator, UnitTranslator, }; diff --git a/crates/polars-parquet/src/parquet/types.rs b/crates/polars-parquet/src/parquet/types.rs index fabb37aa3d2c..7591f6ba0bd7 100644 --- a/crates/polars-parquet/src/parquet/types.rs +++ b/crates/polars-parquet/src/parquet/types.rs @@ -4,6 +4,7 @@ use crate::parquet::schema::types::PhysicalType; pub trait NativeType: std::fmt::Debug + Send + Sync + 'static + Copy + Clone { type Bytes: AsRef<[u8]> + bytemuck::Pod + + IntoIterator + for<'a> TryFrom<&'a [u8], Error = std::array::TryFromSliceError> + std::fmt::Debug + Clone