diff --git a/crates/store/re_types_core/src/size_bytes.rs b/crates/store/re_types_core/src/size_bytes/arrow2_sizes.rs similarity index 71% rename from crates/store/re_types_core/src/size_bytes.rs rename to crates/store/re_types_core/src/size_bytes/arrow2_sizes.rs index 3c821883eb08..6039c4bdcc4a 100644 --- a/crates/store/re_types_core/src/size_bytes.rs +++ b/crates/store/re_types_core/src/size_bytes/arrow2_sizes.rs @@ -1,386 +1,23 @@ -use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; -use std::sync::Arc; - -use arrow2::datatypes::{DataType, Field}; -use arrow2::types::{NativeType, Offset}; -use smallvec::SmallVec; - -// --- - -/// Approximations of stack and heap size for both internal and external types. -/// -/// Motly used for statistics and triggering events such as garbage collection. -pub trait SizeBytes { - /// Returns the total size of `self` in bytes, accounting for both stack and heap space. - #[inline] - fn total_size_bytes(&self) -> u64 { - self.stack_size_bytes() + self.heap_size_bytes() - } - - /// Returns the total size of `self` on the stack, in bytes. - /// - /// Defaults to `std::mem::size_of_val(self)`. - #[inline] - fn stack_size_bytes(&self) -> u64 { - std::mem::size_of_val(self) as _ - } - - /// Returns the total size of `self` on the heap, in bytes. - fn heap_size_bytes(&self) -> u64; - - /// Is `Self` just plain old data? - /// - /// If `true`, this will make most blanket implementations of `SizeBytes` much faster (e.g. `Vec`). - #[inline] - fn is_pod() -> bool { - false - } -} - -// TODO(rust-lang/rust#31844): This isn't happening without specialization. -// impl SizeBytes for T where T: bytemuck::Pod { … } - -// --- Std --- - -impl SizeBytes for String { - /// Does not take capacity into account. - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.as_bytes().len() as u64 - } -} - -impl SizeBytes for BTreeMap { - #[inline] - fn heap_size_bytes(&self) -> u64 { - // NOTE: It's all on the heap at this point. 
- - let keys_size_bytes = if K::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.keys().map(SizeBytes::total_size_bytes).sum::() - }; - - let values_size_bytes = if V::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.values().map(SizeBytes::total_size_bytes).sum::() - }; - - keys_size_bytes + values_size_bytes - } -} - -impl SizeBytes for BTreeSet { - #[inline] - fn heap_size_bytes(&self) -> u64 { - // NOTE: It's all on the heap at this point. - - if K::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.iter().map(SizeBytes::total_size_bytes).sum::() - } - } -} - -impl SizeBytes for HashMap { - #[inline] - fn heap_size_bytes(&self) -> u64 { - // NOTE: It's all on the heap at this point. - - let keys_size_bytes = if K::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.keys().map(SizeBytes::total_size_bytes).sum::() - }; - - let values_size_bytes = if V::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.values().map(SizeBytes::total_size_bytes).sum::() - }; - - keys_size_bytes + values_size_bytes - } -} - -// NOTE: Do _not_ implement `SizeBytes` for slices: we cannot know whether they point to the stack -// or the heap! - -impl SizeBytes for [T; N] { - #[inline] - fn heap_size_bytes(&self) -> u64 { - if T::is_pod() { - 0 // it's a const-sized array - } else { - self.iter().map(SizeBytes::heap_size_bytes).sum::() - } - } -} - -impl SizeBytes for Vec { - /// Does not take capacity into account. - #[inline] - fn heap_size_bytes(&self) -> u64 { - // NOTE: It's all on the heap at this point. - if T::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.iter().map(SizeBytes::total_size_bytes).sum::() - } - } -} - -impl SizeBytes for VecDeque { - /// Does not take capacity into account. - #[inline] - fn heap_size_bytes(&self) -> u64 { - // NOTE: It's all on the heap at this point. 
- if T::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.iter().map(SizeBytes::total_size_bytes).sum::() - } - } -} - -impl SizeBytes for SmallVec<[T; N]> { - /// Does not take capacity into account. - #[inline] - fn heap_size_bytes(&self) -> u64 { - if self.len() <= N { - // The `SmallVec` is still smaller than the threshold so no heap data has been - // allocated yet, beyond the heap data each element might have. - - if T::is_pod() { - 0 // early-out - } else { - self.iter().map(SizeBytes::heap_size_bytes).sum::() - } - } else { - // NOTE: It's all on the heap at this point. - if T::is_pod() { - (self.len() * std::mem::size_of::()) as _ - } else { - self.iter().map(SizeBytes::total_size_bytes).sum::() - } - } - } -} - -impl SizeBytes for Option { - #[inline] - fn heap_size_bytes(&self) -> u64 { - self.as_ref().map_or(0, SizeBytes::heap_size_bytes) - } -} - -impl SizeBytes for Arc { - #[inline] - fn heap_size_bytes(&self) -> u64 { - 0 // assume it's amortized - } -} - -impl SizeBytes for Box { - #[inline] - fn heap_size_bytes(&self) -> u64 { - T::total_size_bytes(&**self) - } -} - -// TODO(rust-lang/rust#31844): `impl SizeBytesExt for T {}` would be nice but -// violates orphan rules. -macro_rules! 
impl_size_bytes_pod { - ($ty:ty) => { - impl SizeBytes for $ty { - #[inline] - fn heap_size_bytes(&self) -> u64 { - 0 - } - - #[inline] - fn is_pod() -> bool { - true - } - } - }; - ($ty:ty, $($rest:ty),+) => { - impl_size_bytes_pod!($ty); impl_size_bytes_pod!($($rest),+); - }; -} - -impl_size_bytes_pod!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128, bool, f32, f64); -impl_size_bytes_pod!(half::f16); - -impl SizeBytes for (T, U) -where - T: SizeBytes, - U: SizeBytes, -{ - #[inline] - fn heap_size_bytes(&self) -> u64 { - let (a, b) = self; - a.heap_size_bytes() + b.heap_size_bytes() - } - - #[inline] - fn is_pod() -> bool { - T::is_pod() && U::is_pod() - } -} - -impl SizeBytes for (T, U, V) -where - T: SizeBytes, - U: SizeBytes, - V: SizeBytes, -{ - #[inline] - fn heap_size_bytes(&self) -> u64 { - let (a, b, c) = self; - a.heap_size_bytes() + b.heap_size_bytes() + c.heap_size_bytes() - } - - #[inline] - fn is_pod() -> bool { - T::is_pod() && U::is_pod() && V::is_pod() - } -} - -impl SizeBytes for (T, U, V, W) -where - T: SizeBytes, - U: SizeBytes, - V: SizeBytes, - W: SizeBytes, -{ - #[inline] - fn heap_size_bytes(&self) -> u64 { - let (a, b, c, d) = self; - a.heap_size_bytes() + b.heap_size_bytes() + c.heap_size_bytes() + d.heap_size_bytes() - } - - #[inline] - fn is_pod() -> bool { - T::is_pod() && U::is_pod() && V::is_pod() && W::is_pod() - } -} - -// --- Arrow --- - -impl SizeBytes for DataType { - #[inline] - fn heap_size_bytes(&self) -> u64 { - match self { - Self::Null - | Self::Binary - | Self::Boolean - | Self::Date32 - | Self::Date64 - | Self::Float16 - | Self::Float32 - | Self::Float64 - | Self::Int16 - | Self::Int32 - | Self::Int64 - | Self::Int8 - | Self::LargeBinary - | Self::LargeUtf8 - | Self::UInt16 - | Self::UInt32 - | Self::UInt64 - | Self::UInt8 - | Self::Time32(_) - | Self::Time64(_) - | Self::Duration(_) - | Self::Interval(_) - | Self::FixedSizeBinary(_) - | Self::Decimal(_, _) - | Self::Decimal256(_, _) - | Self::Utf8 => 0, - 
Self::Timestamp(_, str) => str.heap_size_bytes(), - Self::List(field) - | Self::FixedSizeList(field, _) - | Self::LargeList(field) - | Self::Map(field, _) => field.total_size_bytes(), // NOTE: Boxed, it's all on the heap - Self::Struct(fields) => fields.heap_size_bytes(), - Self::Union(fields, indices, _) => fields.heap_size_bytes() + indices.heap_size_bytes(), - Self::Dictionary(_, datatype, _) => datatype.total_size_bytes(), // NOTE: Boxed, it's all on the heap - Self::Extension(name, datatype, extra) => { - name.heap_size_bytes() - + datatype.total_size_bytes() // NOTE: Boxed, it's all on the heap - + extra.heap_size_bytes() - } - } - } -} - -impl SizeBytes for Field { - #[inline] - fn heap_size_bytes(&self) -> u64 { - let Self { - name, - data_type, - is_nullable, - metadata, - } = self; - - name.heap_size_bytes() - + data_type.heap_size_bytes() - + is_nullable.heap_size_bytes() - + metadata.heap_size_bytes() - } -} - -impl SizeBytes for dyn Array { - #[inline] - fn heap_size_bytes(&self) -> u64 { - estimated_bytes_size(self) as _ - } -} - -impl SizeBytes for Box { - #[inline] - fn heap_size_bytes(&self) -> u64 { - estimated_bytes_size(&**self as _) as _ - } -} - -impl SizeBytes for PrimitiveArray { - #[inline] - fn heap_size_bytes(&self) -> u64 { - estimated_bytes_size(self) as _ - } -} - -impl SizeBytes for ListArray { - #[inline] - fn heap_size_bytes(&self) -> u64 { - estimated_bytes_size(self) as _ - } -} - -impl SizeBytes for StructArray { - #[inline] - fn heap_size_bytes(&self) -> u64 { - estimated_bytes_size(self) as _ - } -} - -// --- Arrow estimations --- - -// The following is a modified version of [1], available under MIT OR Apache-2.0. -// -// [1] https://github.com/jorgecarleitao/arrow2/blob/v0.16.0/src/compute/aggregate/memory.rs - -use arrow2::array::{ - Array, BinaryArray, BooleanArray, DictionaryArray, FixedSizeBinaryArray, FixedSizeListArray, - ListArray, MapArray, PrimitiveArray, StructArray, UnionArray, Utf8Array, +//! 
--- Arrow2 size estimations --- +//! +//! The following is a modified version of [1], available under MIT OR Apache-2.0. +//! +//! [1] <https://github.com/jorgecarleitao/arrow2/blob/v0.16.0/src/compute/aggregate/memory.rs> + +use std::collections::BTreeMap; + +use arrow2::{ + array::{ + Array, BinaryArray, BooleanArray, DictionaryArray, FixedSizeBinaryArray, + FixedSizeListArray, ListArray, MapArray, PrimitiveArray, StructArray, UnionArray, + Utf8Array, + }, + bitmap::Bitmap, + datatypes::{DataType, Field, PhysicalType}, + types::{NativeType, Offset}, }; -use arrow2::bitmap::Bitmap; -use arrow2::datatypes::PhysicalType; + +use super::SizeBytes; macro_rules! with_match_primitive_type {( $key_type:expr, | $_:tt $T:ident | $($body:tt)* @@ -630,7 +267,7 @@ fn estimated_bytes_size(array: &dyn Array) -> usize { // Arrow array?". #[test] #[allow(clippy::from_iter_instead_of_collect)] -fn test_arrow_estimated_size_bytes() { +fn test_arrow2_estimated_size_bytes() { use arrow2::{ array::{Array, Float64Array, ListArray, StructArray, UInt64Array, Utf8Array}, buffer::Buffer, @@ -814,3 +451,102 @@ fn test_arrow_estimated_size_bytes() { assert_eq!(raw_size_bytes, arrow_size_bytes); } } + +impl SizeBytes for DataType { + #[inline] + fn heap_size_bytes(&self) -> u64 { + match self { + Self::Null + | Self::Binary + | Self::Boolean + | Self::Date32 + | Self::Date64 + | Self::Float16 + | Self::Float32 + | Self::Float64 + | Self::Int16 + | Self::Int32 + | Self::Int64 + | Self::Int8 + | Self::LargeBinary + | Self::LargeUtf8 + | Self::UInt16 + | Self::UInt32 + | Self::UInt64 + | Self::UInt8 + | Self::Time32(_) + | Self::Time64(_) + | Self::Duration(_) + | Self::Interval(_) + | Self::FixedSizeBinary(_) + | Self::Decimal(_, _) + | Self::Decimal256(_, _) + | Self::Utf8 => 0, + Self::Timestamp(_, str) => str.heap_size_bytes(), + Self::List(field) + | Self::FixedSizeList(field, _) + | Self::LargeList(field) + | Self::Map(field, _) => field.total_size_bytes(), // NOTE: Boxed, it's all on the heap + Self::Struct(fields) => fields.heap_size_bytes(), + Self::Union(fields, indices, 
_) => fields.heap_size_bytes() + indices.heap_size_bytes(), + Self::Dictionary(_, datatype, _) => datatype.total_size_bytes(), // NOTE: Boxed, it's all on the heap + Self::Extension(name, datatype, extra) => { + name.heap_size_bytes() + + datatype.total_size_bytes() // NOTE: Boxed, it's all on the heap + + extra.heap_size_bytes() + } + } + } +} + +impl SizeBytes for Field { + #[inline] + fn heap_size_bytes(&self) -> u64 { + let Self { + name, + data_type, + is_nullable, + metadata, + } = self; + + name.heap_size_bytes() + + data_type.heap_size_bytes() + + is_nullable.heap_size_bytes() + + metadata.heap_size_bytes() + } +} + +impl SizeBytes for dyn Array { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(self) as _ + } +} + +impl SizeBytes for Box { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(&**self as _) as _ + } +} + +impl SizeBytes for PrimitiveArray { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(self) as _ + } +} + +impl SizeBytes for ListArray { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(self) as _ + } +} + +impl SizeBytes for StructArray { + #[inline] + fn heap_size_bytes(&self) -> u64 { + estimated_bytes_size(self) as _ + } +} diff --git a/crates/store/re_types_core/src/size_bytes/mod.rs b/crates/store/re_types_core/src/size_bytes/mod.rs new file mode 100644 index 000000000000..c05a1bf1f5c4 --- /dev/null +++ b/crates/store/re_types_core/src/size_bytes/mod.rs @@ -0,0 +1,269 @@ +mod arrow2_sizes; + +use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; +use std::sync::Arc; + +use smallvec::SmallVec; + +// --- + +/// Approximations of stack and heap size for both internal and external types. +/// +/// Mostly used for statistics and triggering events such as garbage collection. +pub trait SizeBytes { + /// Returns the total size of `self` in bytes, accounting for both stack and heap space. 
+ #[inline] + fn total_size_bytes(&self) -> u64 { + self.stack_size_bytes() + self.heap_size_bytes() + } + + /// Returns the total size of `self` on the stack, in bytes. + /// + /// Defaults to `std::mem::size_of_val(self)`. + #[inline] + fn stack_size_bytes(&self) -> u64 { + std::mem::size_of_val(self) as _ + } + + /// Returns the total size of `self` on the heap, in bytes. + fn heap_size_bytes(&self) -> u64; + + /// Is `Self` just plain old data? + /// + /// If `true`, this will make most blanket implementations of `SizeBytes` much faster (e.g. `Vec`). + #[inline] + fn is_pod() -> bool { + false + } +} + +// TODO(rust-lang/rust#31844): This isn't happening without specialization. +// impl SizeBytes for T where T: bytemuck::Pod { … } + +// --- Std --- + +impl SizeBytes for String { + /// Does not take capacity into account. + #[inline] + fn heap_size_bytes(&self) -> u64 { + self.as_bytes().len() as u64 + } +} + +impl SizeBytes for BTreeMap { + #[inline] + fn heap_size_bytes(&self) -> u64 { + // NOTE: It's all on the heap at this point. + + let keys_size_bytes = if K::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.keys().map(SizeBytes::total_size_bytes).sum::() + }; + + let values_size_bytes = if V::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.values().map(SizeBytes::total_size_bytes).sum::() + }; + + keys_size_bytes + values_size_bytes + } +} + +impl SizeBytes for BTreeSet { + #[inline] + fn heap_size_bytes(&self) -> u64 { + // NOTE: It's all on the heap at this point. + + if K::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.iter().map(SizeBytes::total_size_bytes).sum::() + } + } +} + +impl SizeBytes for HashMap { + #[inline] + fn heap_size_bytes(&self) -> u64 { + // NOTE: It's all on the heap at this point. 
+ + let keys_size_bytes = if K::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.keys().map(SizeBytes::total_size_bytes).sum::() + }; + + let values_size_bytes = if V::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.values().map(SizeBytes::total_size_bytes).sum::() + }; + + keys_size_bytes + values_size_bytes + } +} + +// NOTE: Do _not_ implement `SizeBytes` for slices: we cannot know whether they point to the stack +// or the heap! + +impl SizeBytes for [T; N] { + #[inline] + fn heap_size_bytes(&self) -> u64 { + if T::is_pod() { + 0 // it's a const-sized array + } else { + self.iter().map(SizeBytes::heap_size_bytes).sum::() + } + } +} + +impl SizeBytes for Vec { + /// Does not take capacity into account. + #[inline] + fn heap_size_bytes(&self) -> u64 { + // NOTE: It's all on the heap at this point. + if T::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.iter().map(SizeBytes::total_size_bytes).sum::() + } + } +} + +impl SizeBytes for VecDeque { + /// Does not take capacity into account. + #[inline] + fn heap_size_bytes(&self) -> u64 { + // NOTE: It's all on the heap at this point. + if T::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.iter().map(SizeBytes::total_size_bytes).sum::() + } + } +} + +impl SizeBytes for SmallVec<[T; N]> { + /// Does not take capacity into account. + #[inline] + fn heap_size_bytes(&self) -> u64 { + if self.len() <= N { + // The `SmallVec` is still smaller than the threshold so no heap data has been + // allocated yet, beyond the heap data each element might have. + + if T::is_pod() { + 0 // early-out + } else { + self.iter().map(SizeBytes::heap_size_bytes).sum::() + } + } else { + // NOTE: It's all on the heap at this point. 
+ if T::is_pod() { + (self.len() * std::mem::size_of::()) as _ + } else { + self.iter().map(SizeBytes::total_size_bytes).sum::() + } + } + } +} + +impl SizeBytes for Option { + #[inline] + fn heap_size_bytes(&self) -> u64 { + self.as_ref().map_or(0, SizeBytes::heap_size_bytes) + } +} + +impl SizeBytes for Arc { + #[inline] + fn heap_size_bytes(&self) -> u64 { + 0 // assume it's amortized + } +} + +impl SizeBytes for Box { + #[inline] + fn heap_size_bytes(&self) -> u64 { + T::total_size_bytes(&**self) + } +} + +// TODO(rust-lang/rust#31844): `impl SizeBytesExt for T {}` would be nice but +// violates orphan rules. +macro_rules! impl_size_bytes_pod { + ($ty:ty) => { + impl SizeBytes for $ty { + #[inline] + fn heap_size_bytes(&self) -> u64 { + 0 + } + + #[inline] + fn is_pod() -> bool { + true + } + } + }; + ($ty:ty, $($rest:ty),+) => { + impl_size_bytes_pod!($ty); impl_size_bytes_pod!($($rest),+); + }; +} + +impl_size_bytes_pod!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128, bool, f32, f64); +impl_size_bytes_pod!(half::f16); + +impl SizeBytes for (T, U) +where + T: SizeBytes, + U: SizeBytes, +{ + #[inline] + fn heap_size_bytes(&self) -> u64 { + let (a, b) = self; + a.heap_size_bytes() + b.heap_size_bytes() + } + + #[inline] + fn is_pod() -> bool { + T::is_pod() && U::is_pod() + } +} + +impl SizeBytes for (T, U, V) +where + T: SizeBytes, + U: SizeBytes, + V: SizeBytes, +{ + #[inline] + fn heap_size_bytes(&self) -> u64 { + let (a, b, c) = self; + a.heap_size_bytes() + b.heap_size_bytes() + c.heap_size_bytes() + } + + #[inline] + fn is_pod() -> bool { + T::is_pod() && U::is_pod() && V::is_pod() + } +} + +impl SizeBytes for (T, U, V, W) +where + T: SizeBytes, + U: SizeBytes, + V: SizeBytes, + W: SizeBytes, +{ + #[inline] + fn heap_size_bytes(&self) -> u64 { + let (a, b, c, d) = self; + a.heap_size_bytes() + b.heap_size_bytes() + c.heap_size_bytes() + d.heap_size_bytes() + } + + #[inline] + fn is_pod() -> bool { + T::is_pod() && U::is_pod() && V::is_pod() && 
W::is_pod() + } +}