Skip to content

Commit

Permalink
Add simple GC for view array types (#5885)
Browse files Browse the repository at this point in the history
* feat: support gc for view_arrays

part 1: implement checker to check if current buffer is compact

Signed-off-by: 蔡略 <[email protected]>

* feat: add gc to view-arrays

part2: implement actual gc algorithm

Signed-off-by: 蔡略 <[email protected]>

* patch: add more tests

Signed-off-by: 蔡略 <[email protected]>

* patch: improve robustness

Signed-off-by: 蔡略 <[email protected]>

* simplify gc

* make fmt happy

* Update arrow-array/src/array/byte_view_array.rs

Co-authored-by: Andrew Lamb <[email protected]>

* improve testing

* check in a benchmark

* actually have multiple buffers in test

---------

Signed-off-by: 蔡略 <[email protected]>
Co-authored-by: 蔡略 <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
3 people authored Jun 14, 2024
1 parent e8285be commit 956fe76
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 0 deletions.
4 changes: 4 additions & 0 deletions arrow-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,7 @@ criterion = { version = "0.5", default-features = false }
[[bench]]
name = "occupancy"
harness = false

[[bench]]
name = "gc_view_types"
harness = false
48 changes: 48 additions & 0 deletions arrow-array/benches/gc_view_types.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_array::StringViewArray;
use criterion::*;

fn gen_view_array(size: usize) -> StringViewArray {
StringViewArray::from_iter((0..size).map(|v| match v % 3 {
0 => Some("small"),
1 => Some("larger than 12 bytes array"),
2 => None,
_ => unreachable!("unreachable"),
}))
}

fn criterion_benchmark(c: &mut Criterion) {
let array = gen_view_array(100_000);

c.bench_function("gc view types all", |b| {
b.iter(|| {
black_box(array.gc());
});
});

let sliced = array.slice(0, 100_000 / 2);
c.bench_function("gc view types slice half", |b| {
b.iter(|| {
black_box(sliced.gc());
});
});
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
85 changes: 85 additions & 0 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,56 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
phantom: Default::default(),
}
}

/// Returns a "compacted" version of this array
///
/// The original array will *not* be modified
///
/// # Garbage Collection
///
/// Before GC:
/// ```text
/// ┌──────┐
/// │......│
/// │......│
/// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
/// │ View 1 │─ ─ ─ ─ │......│ with data that
/// ├────────────────────┤ │......│ is not referred
/// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
/// └────────────────────┘ │......│ View 2
/// │......│
/// 2 views, refer to │......│
/// small portions of a └──────┘
/// large buffer
/// ```
///
/// After GC:
///
/// ```text
/// ┌────────────────────┐ ┌─────┐ After gc, only
/// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
/// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
/// │ View 2 │─ ─ ─ ─ └─────┘ the views is
/// └────────────────────┘ left
///
///
/// 2 views
/// ```
/// This method will compact the data buffers by recreating the view array and only include the data
/// that is pointed to by the views.
///
/// Note that it will copy the array regardless of whether the original array is compact.
/// Use with caution as this can be an expensive operation, only use it when you are sure that the view
/// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
pub fn gc(&self) -> Self {
let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());

for v in self.iter() {
builder.append_option(v);
}

builder.finish()
}
}

impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
Expand Down Expand Up @@ -645,4 +695,39 @@ mod tests {

StringViewArray::new(views, buffers, None);
}

#[test]
fn test_gc() {
let test_data = [
Some("longer than 12 bytes"),
Some("short"),
Some("t"),
Some("longer than 12 bytes"),
None,
Some("short"),
];

let array = {
let mut builder = StringViewBuilder::new().with_block_size(8); // create multiple buffers
test_data.into_iter().for_each(|v| builder.append_option(v));
builder.finish()
};
assert!(array.buffers.len() > 1);

fn check_gc(to_test: &StringViewArray) {
let gc = to_test.gc();
assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());

to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
assert_eq!(a, b);
});
assert_eq!(to_test.len(), gc.len());
}

check_gc(&array);
check_gc(&array.slice(1, 3));
check_gc(&array.slice(2, 1));
check_gc(&array.slice(2, 2));
check_gc(&array.slice(3, 1));
}
}

0 comments on commit 956fe76

Please sign in to comment.