Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BitPackedArray::scalar_at #160

Merged
merged 20 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: jlumbroso/free-disk-space@v1.3.1
with:
tool-cache: false
large-packages: false
docker-images: false
swap-storage: false
- uses: ./.github/actions/setup-zig
- uses: ./.github/actions/setup-rust
- uses: ./.github/actions/setup-python
Expand Down
6 changes: 2 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 0 additions & 6 deletions fastlanez-sys/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,6 @@ links = "fastlanez"
[lints]
workspace = true

[dependencies]
arrayref = { workspace = true }
paste = { workspace = true }
seq-macro = { workspace = true }
uninit = { workspace = true }

[build-dependencies]
bindgen = { workspace = true }
walkdir = { workspace = true }
111 changes: 97 additions & 14 deletions fastlanez/src/bitpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,28 @@ use crate::{Pred, Satisfied};
/// BitPack into a compile-time known bit-width.
///
/// `W` is the packed width of each element in bits. The `Pred` bounds below restrict it to
/// `0 < W < 8 * size_of::<Self>()`, i.e. strictly narrower than the element type itself.
pub trait BitPack<const W: usize>
where
Self: Sized,
Self: Sized + Unsigned + PrimInt,
Pred<{ W > 0 }>: Satisfied,
Pred<{ W < 8 * size_of::<Self>() }>: Satisfied,
{
// fastlanez processes 1024 elements in chunks of 1024 bits at a time
/// Number of `Self`-sized lanes; the implementations generated in this file set it to
/// `128 / size_of::<Self>()`.
const NUM_LANES: usize;
/// Mask selecting the low `W` bits; the implementations generated in this file set it to
/// `(1 << W) - 1`.
const MASK: Self;

/// Packs 1024 elements into W bits each -> (1024 * W / 8) -> 128 * W bytes
///
/// Returns the initialized view of `output`.
fn pack<'a>(
input: &[Self; 1024],
output: &'a mut [MaybeUninit<u8>; 128 * W],
) -> &'a [u8; 128 * W];

/// Unpacks 1024 elements that have been packed into W bits each
///
/// Returns the initialized view of `output`.
fn unpack<'a>(
input: &[u8; 128 * W],
output: &'a mut [MaybeUninit<Self>; 1024],
) -> &'a [Self; 1024];

/// Unpacks a single element (at provided index) that has been packed into W bits
///
/// `index` is the position of the element within the 1024-element block that was packed.
fn unpack_single(input: &[u8; 128 * W], index: usize) -> Self;
}

#[derive(Debug)]
Expand Down Expand Up @@ -69,36 +78,88 @@ where
unsafe { output.set_len(output.len() + 1024) }
Ok(())
}

/// Unpacks the single element at `index` from a block packed at a runtime-selected `width`.
///
/// Returns `Err(UnsupportedBitWidth)` when `width` has no matching `BitPack` implementation.
fn try_unpack_single(
input: &[u8],
width: usize,
index: usize,
) -> Result<Self, UnsupportedBitWidth>;
}

macro_rules! bitpack_impl {
($T:ty, $W:literal) => {
($T:ty, $BITS:literal) => {
paste::item! {
seq!(N in 1..$W {
impl BitPack<N> for $T {
seq!(W in 1..$BITS {
impl BitPack<W> for $T {
const NUM_LANES: usize = 128 / size_of::<$T>();
const MASK: $T = ((1 as $T) << W) - 1;

#[inline]
fn pack<'a>(
input: &[Self; 1024],
output: &'a mut [MaybeUninit<u8>; 128 * N],
) -> &'a [u8; 128 * N] {
output: &'a mut [MaybeUninit<u8>; 128 * W],
) -> &'a [u8; 128 * W] {
unsafe {
let output_array: &mut [u8; 128 * N] = std::mem::transmute(output);
[<fl_bitpack_ $T _u >]~N(input, output_array);
let output_array: &mut [u8; 128 * W] = std::mem::transmute(output);
[<fl_bitpack_ $T _u >]~W(input, output_array);
output_array
}
}

#[inline]
fn unpack<'a>(
input: &[u8; 128 * N],
input: &[u8; 128 * W],
output: &'a mut [MaybeUninit<Self>; 1024],
) -> &'a [Self; 1024] {
unsafe {
let output_array: &mut [Self; 1024] = std::mem::transmute(output);
[<fl_bitunpack_ $T _u >]~N(input, output_array);
[<fl_bitunpack_ $T _u >]~W(input, output_array);
output_array
}
}

// Unpacks the single element at `index` without decoding the whole 1024-element tranche.
//
// The tranche is read as `input.len() / size_of::<Self>()` words of type `Self` in native
// byte order. Element `index` lives in lane `index % NUM_LANES` and starts at bit
// `(index / NUM_LANES) * W` of that lane; word `w` of lane `l` sits at word position
// `w * NUM_LANES + l`.
//
// Panics if the computed word position lies outside the tranche (an out-of-range `index`).
#[inline]
fn unpack_single(
    input: &[u8; 128 * W],
    index: usize
) -> Self {
    let num_lanes = <$T as BitPack<W>>::NUM_LANES;
    let word_bits = $T::BITS as usize;
    let word_count = input.len() / std::mem::size_of::<Self>();

    // Reads one word of the tranche by word position. `input` is only guaranteed to be
    // byte-aligned, so it must not be reinterpreted as `&[Self]`; each word is copied out
    // with an unaligned read instead.
    let read_word = |word_index: usize| -> Self {
        assert!(
            word_index < word_count,
            "index out of bounds: the len is {} but the index is {}",
            word_count,
            word_index
        );
        // SAFETY: `word_index < word_count`, so the `size_of::<Self>()` bytes starting at
        // word position `word_index` lie entirely inside `input`. `read_unaligned` has no
        // alignment requirement, and every bit pattern is a valid value of an unsigned
        // primitive integer.
        unsafe { std::ptr::read_unaligned((input.as_ptr() as *const Self).add(word_index)) }
    };

    // lane_index is the index of the row
    let lane_index = index % num_lanes;
    // lane_start_bit is the bit offset in the combined columns of the row
    let lane_start_bit = (index / num_lanes) * W;

    let first_word = read_word((lane_start_bit / word_bits) * num_lanes + lane_index);
    let start_bit = lane_start_bit % word_bits;
    let bits_left_in_first_word = word_bits - start_bit;

    if bits_left_in_first_word >= W {
        // all the bits we need are in the same word
        (first_word >> start_bit) & <$T as BitPack<W>>::MASK
    } else {
        // the value straddles two consecutive words of the lane; the low bits come from
        // the top of the first word and the high bits from the bottom of the second
        let second_word =
            read_word(((lane_start_bit + W - 1) / word_bits) * num_lanes + lane_index);
        let lo = first_word >> start_bit;
        let hi = second_word << bits_left_in_first_word;
        (lo | hi) & <$T as BitPack<W>>::MASK
    }
}
}
});
}
Expand All @@ -109,9 +170,9 @@ macro_rules! bitpack_impl {
width: usize,
output: &'a mut [MaybeUninit<u8>],
) -> Result<&'a [u8], UnsupportedBitWidth> {
seq!(N in 1..$W {
seq!(W in 1..$BITS {
match width {
#(N => Ok(BitPack::<N>::pack(input, array_mut_ref![output, 0, N * 128]).as_slice()),)*
#(W => Ok(BitPack::<W>::pack(input, array_mut_ref![output, 0, W * 128]).as_slice()),)*
_ => Err(UnsupportedBitWidth),
}
})
Expand All @@ -122,9 +183,18 @@ macro_rules! bitpack_impl {
width: usize,
output: &'a mut [MaybeUninit<Self>; 1024],
) -> Result<&'a [Self; 1024], UnsupportedBitWidth> {
seq!(N in 1..$W {
seq!(W in 1..$BITS {
match width {
#(N => Ok(BitPack::<N>::unpack(array_ref![input, 0, N * 128], output)),)*
#(W => Ok(BitPack::<W>::unpack(array_ref![input, 0, W * 128], output)),)*
_ => Err(UnsupportedBitWidth),
}
})
}

// Runtime-width entry point for `BitPack::unpack_single`: `seq!` expands one match arm per
// width in `1..$BITS`, each forwarding to the const-generic implementation for that width.
fn try_unpack_single(input: &[u8], width: usize, index: usize) -> Result<Self, UnsupportedBitWidth> {
seq!(W in 1..$BITS {
match width {
// only the first `W * 128` bytes of `input` are viewed as the packed block
#(W => Ok(BitPack::<W>::unpack_single(array_ref![input, 0, W * 128], index)),)*
// any width outside `1..$BITS` has no generated implementation
_ => Err(UnsupportedBitWidth),
}
})
}
Expand Down Expand Up @@ -153,4 +223,17 @@ mod test {
TryBitPack::try_unpack_into(&output, 10, &mut decoded).unwrap();
assert_eq!(input, decoded);
}

// Packs 0..1024 at 10 bits per value and checks that every element can be read back
// individually via `try_unpack_single`.
#[test]
fn test_unpack_single() {
    let values = (0u32..1024).collect::<Vec<_>>();
    let mut packed = Vec::new();
    TryBitPack::try_pack_into(array_ref![values, 0, 1024], 10, &mut packed).unwrap();
    // 1024 values * 10 bits / 8 bits per byte
    assert_eq!(packed.len(), 1280);

    for (position, expected) in values.iter().enumerate() {
        let actual = <u32 as TryBitPack>::try_unpack_single(&packed, 10, position).unwrap();
        assert_eq!(actual, *expected);
    }
}
}
5 changes: 0 additions & 5 deletions fastlanez/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
#![allow(incomplete_features)]
#![feature(generic_const_exprs)]
#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]

pub use bitpack::*;
pub use delta::*;
Expand Down
46 changes: 24 additions & 22 deletions fastlanez/src/transpose.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::mem::size_of;
use std::mem::{size_of, MaybeUninit};

use arrayref::array_mut_ref;
use fastlanez_sys::{
Expand All @@ -7,20 +7,10 @@ use fastlanez_sys::{
};
use uninit::prelude::VecCapacity;

const fn transposable<T: Sized, U: Sized>() -> bool {
let sizeOfT = size_of::<T>();
sizeOfT == size_of::<U>() && (sizeOfT == 1 || sizeOfT == 2 || sizeOfT == 4 || sizeOfT == 8)
}

pub fn transpose<T: Sized, U: Sized>(input: &[T; 1024], output: &mut [U; 1024]) {
assert!(
transposable::<T, U>(),
"Cannot transpose {} into {}",
std::any::type_name::<T>(),
std::any::type_name::<U>()
);
pub fn transpose<T: Sized, U: Transposable<T>>(input: &[T; 1024], output: &mut [U; 1024]) {
unsafe {
match size_of::<T>() {
// referencing U::SIZE forces a compile time size check; it is equal to size_of::<T>()
match U::SIZE {
1 => fl_transpose_u8(
input.as_ptr() as *const [u8; 1024],
output.as_ptr() as *mut [u8; 1024],
Expand Down Expand Up @@ -50,15 +40,10 @@ pub fn transpose_into<T: Sized>(input: &[T; 1024], output: &mut Vec<T>) {
}
}

pub fn untranspose<T: Sized, U: Sized>(input: &[T; 1024], output: &mut [U; 1024]) {
assert!(
transposable::<T, U>(),
"Cannot untranspose {} into {}",
std::any::type_name::<T>(),
std::any::type_name::<U>()
);
pub fn untranspose<T: Sized, U: Transposable<T>>(input: &[T; 1024], output: &mut [U; 1024]) {
unsafe {
match size_of::<T>() {
// referencing U::SIZE forces a compile time size check; it is equal to size_of::<T>()
match U::SIZE {
1 => fl_untranspose_u8(
input.as_ptr() as *const [u8; 1024],
output.as_mut_ptr() as *mut [u8; 1024],
Expand Down Expand Up @@ -87,6 +72,23 @@ pub fn untranspose_into<T: Sized>(input: &[T; 1024], output: &mut Vec<T>) {
}
}

/// Marks `Self` as a valid output element type when (un)transposing elements of type `T`.
///
/// `transpose` and `untranspose` dispatch on [`Transposable::SIZE`] and reinterpret both the
/// input and the output buffers as arrays of that many bytes per element, so `Self` and `T`
/// must have the same size. Both requirements are checked at compile time when `SIZE` is
/// evaluated for a concrete pair of types.
pub trait Transposable<T: Sized>: Sized {
    /// Element size in bytes, equal to `size_of::<T>()`.
    ///
    /// Must be referenced to force compile-time size checking: the assertions below only
    /// run when the constant is evaluated.
    const SIZE: usize = {
        assert!(
            size_of::<T>() == 1
                || size_of::<T>() == 2
                || size_of::<T>() == 4
                || size_of::<T>() == 8,
            "T must be 1, 2, 4 or 8 bytes in size"
        );
        // Guards against implementations for a `Self` whose size differs from `T`, which
        // would make the pointer reinterpretation in the callers read or write out of bounds.
        assert!(
            size_of::<Self>() == size_of::<T>(),
            "Self must be the same size as T"
        );
        size_of::<T>()
    };
}

// A type can always be transposed into itself.
impl<T: Sized> Transposable<T> for T {}
// `MaybeUninit<T>` has the same size as `T`, so uninitialized output buffers are allowed.
impl<T: Sized> Transposable<T> for MaybeUninit<T> {}

#[cfg(test)]
mod test {
use arrayref::array_ref;
Expand Down
6 changes: 6 additions & 0 deletions vortex-fastlanes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,10 @@ fastlanez = { path = "../fastlanez" }
log = { workspace = true }

[dev-dependencies]
criterion = { workspace = true }
rand = { workspace = true }
simplelog = { workspace = true }

[[bench]]
name = "bitpacking"
harness = false
57 changes: 57 additions & 0 deletions vortex-fastlanes/benches/bitpacking.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use fastlanez::TryBitPack;
use rand::distributions::Uniform;
use rand::{thread_rng, Rng};
use vortex_fastlanes::{bitpack_primitive, unpack_primitive, unpack_single_primitive};

/// Generates `len` random `u32` values drawn uniformly from `0..2^bits`.
fn values(len: usize, bits: usize) -> Vec<u32> {
    let upper_bound = 2_u32.pow(bits as u32);
    thread_rng()
        .sample_iter(Uniform::new(0_u32, upper_bound))
        .take(len)
        .collect()
}

/// Decodes the first `length` elements of `packed` one at a time via
/// `unpack_single_primitive`, collecting them into a vector.
fn unpack_singles(packed: &[u8], bit_width: usize, length: usize) -> Vec<u32> {
    // NOTE(review): the safety contract of `unpack_single_primitive` is not visible from this
    // file; presumably `packed` must hold valid blocks packed at `bit_width` — confirm.
    (0..length)
        .map(|i| unsafe { unpack_single_primitive::<u32>(packed, bit_width, i).unwrap() })
        .collect()
}

/// Criterion benchmarks for bit-packing and unpacking one million random `u32` values.
///
/// Every benchmark uses the single `bits` width defined at the top, so changing it
/// re-targets the whole suite consistently.
fn pack_unpack(c: &mut Criterion) {
    let bits: usize = 8;
    let values = values(1_000_000, bits);

    c.bench_function("bitpack_1M", |b| {
        b.iter(|| black_box(bitpack_primitive(&values, bits)));
    });

    let packed = bitpack_primitive(&values, bits);
    c.bench_function("unpack_1M", |b| {
        b.iter(|| black_box(unpack_primitive::<u32>(&packed, bits, values.len())));
    });

    c.bench_function("unpack_1M_singles", |b| {
        b.iter(|| black_box(unpack_singles(&packed, bits, values.len())));
    });

    // 1024 elements pack into `128 * bits` bytes
    let packed_1024 = &packed[0..128 * bits];
    // allocated once and reused so the benchmark measures unpacking, not allocation
    let mut output: Vec<u32> = Vec::with_capacity(1024);
    c.bench_function("unpack_1024", |b| {
        b.iter(|| {
            output.clear();
            TryBitPack::try_unpack_into(packed_1024, bits, &mut output).unwrap();
            black_box(output[0])
        })
    });

    c.bench_function("unpack_single", |b| {
        b.iter(|| black_box(unsafe { unpack_single_primitive::<u32>(packed_1024, bits, 0) }));
    });
}

criterion_group!(benches, pack_unpack);
criterion_main!(benches);
Loading