From 4c38f9543a8a056cb37c4a59dd3fc97018f003e5 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 10 Oct 2024 13:32:42 -0400 Subject: [PATCH 1/5] checkpoint runend changes --- encodings/runend/src/compress.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index f50d6a9a2..ae50956af 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -3,6 +3,7 @@ use std::cmp::min; use itertools::Itertools; use num_traits::{AsPrimitive, FromPrimitive}; use vortex::array::PrimitiveArray; +use vortex::compute::unary::scalar_at; use vortex::stats::{ArrayStatistics, Stat}; use vortex::validity::Validity; use vortex::ArrayDType; @@ -20,19 +21,21 @@ pub fn runend_encode(array: &PrimitiveArray) -> (PrimitiveArray, PrimitiveArray) let (ends, values) = runend_encode_primitive(array.maybe_null_slice::<$P>()); let mut compressed_values = PrimitiveArray::from_vec(values, validity); - compressed_values.statistics().set(Stat::IsConstant, false.into()); compressed_values.statistics().set(Stat::RunCount, compressed_values.len().into()); array.statistics().get(Stat::Min).map(|s| compressed_values.statistics().set(Stat::Min, s)); array.statistics().get(Stat::Max).map(|s| compressed_values.statistics().set(Stat::Max, s)); + array.statistics().get(Stat::IsConstant).map(|s| compressed_values.statistics().set(Stat::IsConstant, s)); array.statistics().get(Stat::IsSorted).map(|s| compressed_values.statistics().set(Stat::IsSorted, s)); array.statistics().get(Stat::IsStrictSorted).map(|s| compressed_values.statistics().set(Stat::IsStrictSorted, s)); let compressed_ends = PrimitiveArray::from(ends); + compressed_ends.statistics().set(Stat::IsConstant, (compressed_ends.len() > 1).into()); compressed_ends.statistics().set(Stat::IsSorted, true.into()); compressed_ends.statistics().set(Stat::IsStrictSorted, true.into()); - compressed_ends.statistics().set(Stat::IsConstant, false.into()); - compressed_ends.statistics().set(Stat::Max, array.len().into()); - compressed_ends.statistics().set(Stat::RunCount, compressed_ends.len().into()); + if (compressed_ends.len() > 0) { + compressed_ends.statistics().set(Stat::Min, scalar_at(&compressed_ends, 0).unwrap().into()); + compressed_ends.statistics().set(Stat::Max, scalar_at(&compressed_ends, compressed_ends.len() - 1).unwrap().into()); + } assert_eq!(array.dtype(), compressed_values.dtype()); (compressed_ends, compressed_values) From ae8a12412996b6708f2000bde68e6830a93ad52c Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 10 Oct 2024 13:55:35 -0400 Subject: [PATCH 2/5] moar --- encodings/runend/src/compress.rs | 50 ++++++++++++++++++-------------- encodings/runend/src/compute.rs | 1 - encodings/runend/src/runend.rs | 30 +++++++++++-------- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index ae50956af..e31574d09 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -8,7 +8,7 @@ use vortex::stats::{ArrayStatistics, Stat}; use vortex::validity::Validity; use vortex::ArrayDType; use vortex_dtype::{match_each_integer_ptype, match_each_native_ptype, NativePType, Nullability}; -use vortex_error::{vortex_panic, VortexResult}; +use vortex_error::{vortex_panic, VortexResult, VortexUnwrap as _}; pub fn runend_encode(array: &PrimitiveArray) -> (PrimitiveArray, PrimitiveArray) { let validity = if array.dtype().nullability() == Nullability::NonNullable { @@ -17,29 +17,37 @@ pub fn runend_encode(array: &PrimitiveArray) -> (PrimitiveArray, PrimitiveArray) Validity::AllValid }; - match_each_native_ptype!(array.ptype(), |$P| { + let (compressed_ends, compressed_values) = match_each_native_ptype!(array.ptype(), |$P| { let (ends, values) = runend_encode_primitive(array.maybe_null_slice::<$P>()); + (PrimitiveArray::from_vec(ends, Validity::NonNullable), PrimitiveArray::from_vec(values, validity)) + }); - let mut compressed_values = PrimitiveArray::from_vec(values, validity); - compressed_values.statistics().set(Stat::RunCount, compressed_values.len().into()); - array.statistics().get(Stat::Min).map(|s| compressed_values.statistics().set(Stat::Min, s)); - array.statistics().get(Stat::Max).map(|s| compressed_values.statistics().set(Stat::Max, s)); - array.statistics().get(Stat::IsConstant).map(|s| compressed_values.statistics().set(Stat::IsConstant, s)); - array.statistics().get(Stat::IsSorted).map(|s| compressed_values.statistics().set(Stat::IsSorted, s)); - array.statistics().get(Stat::IsStrictSorted).map(|s| compressed_values.statistics().set(Stat::IsStrictSorted, s)); - - let compressed_ends = PrimitiveArray::from(ends); - compressed_ends.statistics().set(Stat::IsConstant, (compressed_ends.len() > 1).into()); - compressed_ends.statistics().set(Stat::IsSorted, true.into()); - compressed_ends.statistics().set(Stat::IsStrictSorted, true.into()); - if (compressed_ends.len() > 0) { - compressed_ends.statistics().set(Stat::Min, scalar_at(&compressed_ends, 0).unwrap().into()); - compressed_ends.statistics().set(Stat::Max, scalar_at(&compressed_ends, compressed_ends.len() - 1).unwrap().into()); - } + // the values array stats are trivially derived + compressed_values.statistics().set(Stat::RunCount, compressed_values.len().into()); + compressed_values.statistics().set(Stat::IsConstant, (compressed_values.len() == 1).into()); + if let Some(min) = array.statistics().get(Stat::Min) { + compressed_values.statistics().set(Stat::Min, min); + } + if let Some(max) = array.statistics().get(Stat::Max) { + compressed_values.statistics().set(Stat::Max, max); + } + if let Some(is_sorted) = array.statistics().get(Stat::IsSorted) { + compressed_values.statistics().set(Stat::IsSorted, is_sorted); + } + if let Some(is_strict_sorted) = array.statistics().get(Stat::IsStrictSorted) { + compressed_values.statistics().set(Stat::IsStrictSorted, is_strict_sorted); + } + + compressed_ends.statistics().set(Stat::IsConstant, (compressed_ends.len() == 1).into()); + compressed_ends.statistics().set(Stat::IsSorted, true.into()); + compressed_ends.statistics().set(Stat::IsStrictSorted, true.into()); + if !compressed_ends.is_empty() { + compressed_ends.statistics().set(Stat::Min, scalar_at(&compressed_ends, 0).vortex_unwrap()); + compressed_ends.statistics().set(Stat::Max, scalar_at(&compressed_ends, compressed_ends.len() - 1).vortex_unwrap()); + } - assert_eq!(array.dtype(), compressed_values.dtype()); - (compressed_ends, compressed_values) - }) + assert_eq!(array.dtype(), compressed_values.dtype()); + (compressed_ends, compressed_values) } fn runend_encode_primitive(elements: &[T]) -> (Vec, Vec) { diff --git a/encodings/runend/src/compute.rs b/encodings/runend/src/compute.rs index 2955dbec3..99ec7110d 100644 --- a/encodings/runend/src/compute.rs +++ b/encodings/runend/src/compute.rs @@ -103,7 +103,6 @@ impl SliceFn for RunEndArray { slice(self.ends(), slice_begin, slice_end + 1)?, slice(self.values(), slice_begin, slice_end + 1)?, self.validity().slice(start, stop)?, - stop - start, start + self.offset(), )? .into_array()) diff --git a/encodings/runend/src/runend.rs b/encodings/runend/src/runend.rs index 13c5460da..d5bbd9886 100644 --- a/encodings/runend/src/runend.rs +++ b/encodings/runend/src/runend.rs @@ -13,7 +13,7 @@ use vortex::{ impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArray, IntoArrayVariant, IntoCanonical, }; -use vortex_dtype::DType; +use vortex_dtype::PType; use vortex_error::{vortex_bail, VortexExpect as _, VortexResult}; use crate::compress::{runend_decode, runend_encode}; @@ -23,10 +23,9 @@ impl_encoding!("vortex.runend", ids::RUN_END, RunEnd); #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RunEndMetadata { validity: ValidityMetadata, - ends_dtype: DType, + ends_ptype: PType, num_runs: usize, offset: usize, - length: usize, } impl Display for RunEndMetadata { @@ -37,20 +36,18 @@ impl Display for RunEndMetadata { impl RunEndArray { pub fn try_new(ends: Array, values: Array, validity: Validity) -> VortexResult { - let length: usize = scalar_at(&ends, ends.len() - 1)?.as_ref().try_into()?; - Self::with_offset_and_size(ends, values, validity, length, 0) + Self::with_offset_and_size(ends, values, validity, 0) } pub(crate) fn with_offset_and_size( ends: Array, values: Array, validity: Validity, - length: usize, offset: usize, ) -> VortexResult { - if values.dtype().is_nullable() == (validity == Validity::NonNullable) { + if values.dtype().nullability() != validity.nullability() { vortex_bail!( - "incorrect validity {:?} for dtype {}", + "invalid validity {:?} for dtype {}", validity, values.dtype() ); @@ -63,16 +60,25 @@ impl RunEndArray { } } + if !ends.dtype().is_unsigned_int() || ends.dtype().is_nullable() { + vortex_bail!(MismatchedTypes: "non-nullable unsigned int", ends.dtype()); + } if !ends.statistics().compute_is_strict_sorted().unwrap_or(true) { - vortex_bail!("Ends array must be strictly sorted",); + vortex_bail!("Ends array must be strictly sorted"); } + + let length = if ends.is_empty() { + 0 + } else { + scalar_at(&ends, ends.len() - 1)?.as_ref().try_into()? + }; + let dtype = values.dtype().clone(); let metadata = RunEndMetadata { validity: validity.to_metadata(length)?, - ends_dtype: ends.dtype().clone(), + ends_ptype: PType::try_from(ends.dtype())?, num_runs: ends.len(), offset, - length, }; let mut children = Vec::with_capacity(3); @@ -142,7 +148,7 @@ impl RunEndArray { #[inline] pub fn ends(&self) -> Array { self.as_ref() - .child(0, &self.metadata().ends_dtype, self.metadata().num_runs) + .child(0, &self.metadata().ends_ptype.into(), self.metadata().num_runs) .vortex_expect("RunEndArray is missing its run ends") } From b9a556d4bc5e2e78549123192d8501b99280f046 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 10 Oct 2024 15:24:46 -0400 Subject: [PATCH 3/5] bring back length (a little bit) --- encodings/runend/src/runend.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/encodings/runend/src/runend.rs b/encodings/runend/src/runend.rs index d5bbd9886..d1a88c1fb 100644 --- a/encodings/runend/src/runend.rs +++ b/encodings/runend/src/runend.rs @@ -36,7 +36,12 @@ impl Display for RunEndMetadata { impl RunEndArray { pub fn try_new(ends: Array, values: Array, validity: Validity) -> VortexResult { - Self::with_offset_and_size(ends, values, validity, 0) + let length = if ends.is_empty() { + 0 + } else { + scalar_at(&ends, ends.len() - 1)?.as_ref().try_into()? + }; + Self::with_offset_and_size(ends, values, validity, 0, length) } pub(crate) fn with_offset_and_size( @@ -44,6 +49,7 @@ impl RunEndArray { values: Array, validity: Validity, offset: usize, + length: usize, ) -> VortexResult { if values.dtype().nullability() != validity.nullability() { vortex_bail!( @@ -67,12 +73,6 @@ impl RunEndArray { vortex_bail!("Ends array must be strictly sorted"); } - let length = if ends.is_empty() { - 0 - } else { - scalar_at(&ends, ends.len() - 1)?.as_ref().try_into()? - }; - let dtype = values.dtype().clone(); let metadata = RunEndMetadata { validity: validity.to_metadata(length)?, From 193e81ce65a9820cd477f53b58f685d224f5fd36 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 10 Oct 2024 15:27:59 -0400 Subject: [PATCH 4/5] fix test --- encodings/runend/src/compute.rs | 5 +++-- encodings/runend/src/runend.rs | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/encodings/runend/src/compute.rs b/encodings/runend/src/compute.rs index 99ec7110d..e4b8ccff1 100644 --- a/encodings/runend/src/compute.rs +++ b/encodings/runend/src/compute.rs @@ -99,11 +99,12 @@ impl SliceFn for RunEndArray { let slice_begin = self.find_physical_index(start)?; let slice_end = self.find_physical_index(stop)?; - Ok(Self::with_offset_and_size( + Ok(Self::with_offset_and_length( slice(self.ends(), slice_begin, slice_end + 1)?, slice(self.values(), slice_begin, slice_end + 1)?, self.validity().slice(start, stop)?, start + self.offset(), + stop - start, )? .into_array()) } @@ -186,7 +187,7 @@ mod test { #[test] fn slice_with_nulls() { let array = RunEndArray::try_new( - PrimitiveArray::from(vec![3, 6, 8, 12]).into_array(), + PrimitiveArray::from(vec![3u32, 6, 8, 12]).into_array(), PrimitiveArray::from_vec(vec![1, 4, 2, 5], Validity::AllValid).into_array(), Validity::from(vec![ false, false, false, false, true, true, false, false, false, false, true, true, diff --git a/encodings/runend/src/runend.rs b/encodings/runend/src/runend.rs index d1a88c1fb..165efcda9 100644 --- a/encodings/runend/src/runend.rs +++ b/encodings/runend/src/runend.rs @@ -41,10 +41,10 @@ impl RunEndArray { } else { scalar_at(&ends, ends.len() - 1)?.as_ref().try_into()? }; - Self::with_offset_and_size(ends, values, validity, 0, length) + Self::with_offset_and_length(ends, values, validity, 0, length) } - pub(crate) fn with_offset_and_size( + pub(crate) fn with_offset_and_length( ends: Array, values: Array, validity: Validity, From 3c78a0427847c51feafd7010e012c2a6d84156a1 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 10 Oct 2024 15:28:41 -0400 Subject: [PATCH 5/5] fmt --- encodings/runend/src/compress.rs | 39 ++++++++++++++++++++++++-------- encodings/runend/src/runend.rs | 6 ++++- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/encodings/runend/src/compress.rs b/encodings/runend/src/compress.rs index e31574d09..5ec84f6bc 100644 --- a/encodings/runend/src/compress.rs +++ b/encodings/runend/src/compress.rs @@ -23,8 +23,12 @@ pub fn runend_encode(array: &PrimitiveArray) -> (PrimitiveArray, PrimitiveArray) }); // the values array stats are trivially derived - compressed_values.statistics().set(Stat::RunCount, compressed_values.len().into()); - compressed_values.statistics().set(Stat::IsConstant, (compressed_values.len() == 1).into()); + compressed_values + .statistics() + .set(Stat::RunCount, compressed_values.len().into()); + compressed_values + .statistics() + .set(Stat::IsConstant, (compressed_values.len() == 1).into()); if let Some(min) = array.statistics().get(Stat::Min) { compressed_values.statistics().set(Stat::Min, min); } @@ -32,18 +36,33 @@ pub fn runend_encode(array: &PrimitiveArray) -> (PrimitiveArray, PrimitiveArray) compressed_values.statistics().set(Stat::Max, max); } if let Some(is_sorted) = array.statistics().get(Stat::IsSorted) { - compressed_values.statistics().set(Stat::IsSorted, is_sorted); + compressed_values + .statistics() + .set(Stat::IsSorted, is_sorted); } if let Some(is_strict_sorted) = array.statistics().get(Stat::IsStrictSorted) { - compressed_values.statistics().set(Stat::IsStrictSorted, is_strict_sorted); + compressed_values + .statistics() + .set(Stat::IsStrictSorted, is_strict_sorted); } - - compressed_ends.statistics().set(Stat::IsConstant, (compressed_ends.len() == 1).into()); - compressed_ends.statistics().set(Stat::IsSorted, true.into()); - compressed_ends.statistics().set(Stat::IsStrictSorted, true.into()); + + compressed_ends + .statistics() + .set(Stat::IsConstant, (compressed_ends.len() == 1).into()); + compressed_ends + .statistics() + .set(Stat::IsSorted, true.into()); + compressed_ends + .statistics() + .set(Stat::IsStrictSorted, true.into()); if !compressed_ends.is_empty() { - compressed_ends.statistics().set(Stat::Min, scalar_at(&compressed_ends, 0).vortex_unwrap()); - compressed_ends.statistics().set(Stat::Max, scalar_at(&compressed_ends, compressed_ends.len() - 1).vortex_unwrap()); + compressed_ends + .statistics() + .set(Stat::Min, scalar_at(&compressed_ends, 0).vortex_unwrap()); + compressed_ends.statistics().set( + Stat::Max, + scalar_at(&compressed_ends, compressed_ends.len() - 1).vortex_unwrap(), + ); } assert_eq!(array.dtype(), compressed_values.dtype()); diff --git a/encodings/runend/src/runend.rs b/encodings/runend/src/runend.rs index 165efcda9..0fbb47656 100644 --- a/encodings/runend/src/runend.rs +++ b/encodings/runend/src/runend.rs @@ -148,7 +148,11 @@ impl RunEndArray { #[inline] pub fn ends(&self) -> Array { self.as_ref() - .child(0, &self.metadata().ends_ptype.into(), self.metadata().num_runs) + .child( + 0, + &self.metadata().ends_ptype.into(), + self.metadata().num_runs, + ) .vortex_expect("RunEndArray is missing its run ends") }