From 8040c54a936c77f7e773ef593d739a96e80ea032 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 12 Jul 2024 11:13:27 -0400 Subject: [PATCH 1/5] minor update --- arrow-array/src/array/byte_view_array.rs | 3 +-- arrow-array/src/builder/generic_bytes_view_builder.rs | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index a00bf7271bb..dc4cbe6834c 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -325,8 +325,7 @@ impl GenericByteViewArray { /// Use with caution as this can be an expensive operation, only use it when you are sure that the view /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. pub fn gc(&self) -> Self { - let mut builder = - GenericByteViewBuilder::::with_capacity(self.len()).with_deduplicate_strings(); + let mut builder = GenericByteViewBuilder::::with_capacity(self.len()); for v in self.iter() { builder.append_option(v); diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index dda55354564..5368a6cb77f 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -201,7 +201,8 @@ impl GenericByteViewBuilder { /// Returns the value at the given index /// Useful if we want to know what value has been inserted to the builder - fn get_value(&self, index: usize) -> &[u8] { + /// The index has to be smaller than `self.len()`, otherwise it will panic + pub fn get_value(&self, index: usize) -> &[u8] { let view = self.views_builder.as_slice().get(index).unwrap(); let len = *view as u32; if len <= 12 { From 2f78b19f70347e2930571e5e7f8fddafbd2f8f84 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 12 Jul 2024 11:36:06 -0400 Subject: [PATCH 2/5] add memory accounting --- .../src/builder/generic_bytes_view_builder.rs | 13 +++++++++++++ arrow-buffer/src/builder/null.rs | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 5368a6cb77f..64eff60fb8e 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -338,6 +338,19 @@ impl GenericByteViewBuilder { pub fn validity_slice(&self) -> Option<&[u8]> { self.null_buffer_builder.as_slice() } + + /// Return the allocated size of this builder, useful for memory accounting. + pub fn allocated_size(&self) -> usize { + let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); + let in_progress = self.in_progress.capacity(); + let null = self.null_buffer_builder.allocated_size(); + let tracker = match &self.string_tracker { + Some((ht, _)) => ht.capacity() * std::mem::size_of::(), + None => 0, + }; + let views = self.views_builder.capacity() * std::mem::size_of::(); + buffer_size + in_progress + tracker + views + null + } } impl Default for GenericByteViewBuilder { diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs index 55b3303c9e3..90fd9958804 100644 --- a/arrow-buffer/src/builder/null.rs +++ b/arrow-buffer/src/builder/null.rs @@ -161,6 +161,14 @@ impl NullBufferBuilder { pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> { self.bitmap_builder.as_mut().map(|b| b.as_slice_mut()) } + + /// Return the allocated size of this builder, useful for memory accounting. + pub fn allocated_size(&self) -> usize { + self.bitmap_builder + .as_ref() + .map(|b| b.capacity()) + .unwrap_or(0) + } } impl NullBufferBuilder { From 2e3f1b7da5f3d259ed838cc97083fc991ac8dd42 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Sat, 13 Jul 2024 11:24:38 -0400 Subject: [PATCH 3/5] Update arrow-buffer/src/builder/null.rs Co-authored-by: Andrew Lamb --- arrow-buffer/src/builder/null.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs index 90fd9958804..a1cea6ef2cc 100644 --- a/arrow-buffer/src/builder/null.rs +++ b/arrow-buffer/src/builder/null.rs @@ -162,7 +162,7 @@ impl NullBufferBuilder { self.bitmap_builder.as_mut().map(|b| b.as_slice_mut()) } - /// Return the allocated size of this builder, useful for memory accounting. + /// Return the allocated size of this builder, in bytes, useful for memory accounting. pub fn allocated_size(&self) -> usize { self.bitmap_builder .as_ref() From 1063ad9e91526304d5efa2906a8b7c0f9e3bfe85 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Sat, 13 Jul 2024 11:24:44 -0400 Subject: [PATCH 4/5] Update arrow-array/src/builder/generic_bytes_view_builder.rs Co-authored-by: Andrew Lamb --- arrow-array/src/builder/generic_bytes_view_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 64eff60fb8e..ce00c031ddd 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -339,7 +339,7 @@ impl GenericByteViewBuilder { self.null_buffer_builder.as_slice() } - /// Return the allocated size of this builder, useful for memory accounting. + /// Return the allocated size of this builder in bytes, useful for memory accounting. pub fn allocated_size(&self) -> usize { let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); let in_progress = self.in_progress.capacity(); From eaef0ae812f660fdb41c472d651a72b0c817bce7 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Sat, 13 Jul 2024 11:27:21 -0400 Subject: [PATCH 5/5] update comments --- arrow-array/src/array/byte_view_array.rs | 3 +++ arrow-array/src/builder/generic_bytes_view_builder.rs | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index dc4cbe6834c..5ce150a7a34 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -324,6 +324,9 @@ impl GenericByteViewArray { /// Note that it will copy the array regardless of whether the original array is compact. /// Use with caution as this can be an expensive operation, only use it when you are sure that the view /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing. + /// + /// Note: this function does not attempt to canonicalize / deduplicate values. For this + /// feature see [`GenericByteViewBuilder::with_deduplicate_strings`]. pub fn gc(&self) -> Self { let mut builder = GenericByteViewBuilder::::with_capacity(self.len()); diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index ce00c031ddd..587255cc6b6 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -341,14 +341,14 @@ impl GenericByteViewBuilder { /// Return the allocated size of this builder in bytes, useful for memory accounting. pub fn allocated_size(&self) -> usize { + let views = self.views_builder.capacity() * std::mem::size_of::(); + let null = self.null_buffer_builder.allocated_size(); let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::(); let in_progress = self.in_progress.capacity(); - let null = self.null_buffer_builder.allocated_size(); let tracker = match &self.string_tracker { Some((ht, _)) => ht.capacity() * std::mem::size_of::(), None => 0, }; - let views = self.views_builder.capacity() * std::mem::size_of::(); buffer_size + in_progress + tracker + views + null } }