From f3a39c670a00ef2ef22d19c9b023fd53996197fe Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 3 Jul 2024 12:02:58 -0400 Subject: [PATCH 01/16] wip on gcd const --- benches/bitpacking.rs | 18 +++++++- src/bitpacking.rs | 95 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 1 deletion(-) diff --git a/benches/bitpacking.rs b/benches/bitpacking.rs index f29402d..b1f425e 100644 --- a/benches/bitpacking.rs +++ b/benches/bitpacking.rs @@ -46,7 +46,7 @@ fn pack(c: &mut Criterion) { { let mut group = c.benchmark_group("unpack-single"); - group.bench_function("unpack single 16 <- 3", |b| { + group.bench_function("unpack_single 16 <- 3", |b| { const WIDTH: usize = 3; let values = vec![3u16; 1024]; let mut packed = vec![0; 128 * WIDTH / size_of::()]; @@ -61,6 +61,22 @@ fn pack(c: &mut Criterion) { } }); }); + + group.bench_function("unpack_single2 16 <- 3", |b| { + const WIDTH: usize = 3; + let values = vec![3u16; 1024]; + let mut packed = vec![0; 128 * WIDTH / size_of::()]; + BitPacking::pack::(array_ref![values, 0, 1024], array_mut_ref![packed, 0, 192]); + + b.iter(|| { + for i in 0..1024 { + black_box::(BitPacking::unpack_single2::( + array_ref![packed, 0, 192], + i, + )); + } + }); + }); } } diff --git a/src/bitpacking.rs b/src/bitpacking.rs index 89c2e32..5ebad92 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -99,6 +99,101 @@ pub trait BitPacking: FastLanes { (lo | hi) & mask } + /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. + fn unpack_single2(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self + where + BitPackWidth: SupportedBitPackWidth, + { + // Special case for W=0, since there's only one possible value. + if W == 0 { + return Self::zero(); + } + + // We can think of the input array as effectively a row-major, left-to-right + // 2-D array of with `Self::LANES` columns and `Self::T` rows. + // + // Meanwhile, we can think of the packed array as either: + // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns + // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns + // + // Bitpacking involves a transposition of the input array ordering, such that + // decompression can be fused efficiently with encodings like delta and RLE. + // + // First step, we need to get the lane and row for interpretation #1 above. + let (lane, row): (usize, usize) = seq!(I in 0..1024 { + match index { + #(I => + Self::packed_lane_and_row::(), + )* + _ => unreachable!("Unsupported index: {}", index) + } + }); + + // From the row, we can get the correct start bit within the lane. + let start_bit = row * W; + let start_word = (start_bit) / Self::T; + let end_word = (start_bit + W - 1) / Self::T; + let one_word = (start_word == end_word); + let mask: Self = if W == Self::T { + Self::max_value() + } else { + ((Self::one()) << (W % Self::T)) - Self::one() + }; + + + unsafe { Self::unpack_single_const_helper::(packed, mask) } + } + + fn packed_lane_and_row() -> (usize, usize) { + // We can think of the input array as effectively a row-major, left-to-right + // 2-D array of with `Self::LANES` columns and `Self::T` rows. + // + // Meanwhile, we can think of the packed array as either: + // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns + // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns + // + // Bitpacking involves a transposition of the input array ordering, such that + // decompression can be fused efficiently with encodings like delta and RLE. + // + // First step, we need to get the lane and row for interpretation #1 above. + let lane = INDEX % Self::LANES; + let row = { + // This is the inverse of the `index` function from the pack/unpack macros: + // fn index(row: usize, lane: usize) -> usize { + // let o = row / 8; + // let s = row % 8; + // (FL_ORDER[o] * 16) + (s * 128) + lane + // } + let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 + let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] + let o = FL_ORDER[fl_order]; // because this transposition is invertible! + o * 8 + s + }; + (lane, row) + } + + /// Unpacks a single element at the provided LANE and START_BIT from a packed array of 1024 `W` bit elements, + /// where `W` is runtime-known instead of compile-time known. + /// + /// # Safety + /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W` + /// is the packed width. The output slice must be of exactly length 1024. + /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds). + unsafe fn unpack_single_const_helper( + packed: &[Self], mask: Self) -> Self + { + let start_word = START_BIT / Self::T; + let lo_shift = START_BIT % Self::T; + let lo = packed[Self::LANES * start_word + LANE] >> lo_shift; + if ONE_WORD { + lo & mask + } else { + let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false + let hi = packed[Self::LANES * (start_word + 1) + LANE] << hi_shift; + (lo | hi) & mask + } + } + /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements, /// where `W` is runtime-known instead of compile-time known. /// From aac0ddc65358f1ed9fa4d526574db747aca480f6 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 4 Jul 2024 15:31:21 -0400 Subject: [PATCH 02/16] wip --- src/bitpacking.rs | 121 +++++++++++++++++++++++++++++----------------- src/lib.rs | 13 +++++ 2 files changed, 89 insertions(+), 45 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index 5ebad92..f4ea35d 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -1,8 +1,9 @@ use arrayref::{array_mut_ref, array_ref}; use core::mem::size_of; use paste::paste; +use seq_macro::seq; -use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; +use crate::{pack, seq_t, seq_start_bits, seq_lanes, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; pub struct BitPackWidth; pub trait SupportedBitPackWidth {} @@ -102,47 +103,7 @@ pub trait BitPacking: FastLanes { /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. fn unpack_single2(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self where - BitPackWidth: SupportedBitPackWidth, - { - // Special case for W=0, since there's only one possible value. - if W == 0 { - return Self::zero(); - } - - // We can think of the input array as effectively a row-major, left-to-right - // 2-D array of with `Self::LANES` columns and `Self::T` rows. - // - // Meanwhile, we can think of the packed array as either: - // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns - // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns - // - // Bitpacking involves a transposition of the input array ordering, such that - // decompression can be fused efficiently with encodings like delta and RLE. - // - // First step, we need to get the lane and row for interpretation #1 above. - let (lane, row): (usize, usize) = seq!(I in 0..1024 { - match index { - #(I => - Self::packed_lane_and_row::(), - )* - _ => unreachable!("Unsupported index: {}", index) - } - }); - - // From the row, we can get the correct start bit within the lane. - let start_bit = row * W; - let start_word = (start_bit) / Self::T; - let end_word = (start_bit + W - 1) / Self::T; - let one_word = (start_word == end_word); - let mask: Self = if W == Self::T { - Self::max_value() - } else { - ((Self::one()) << (W % Self::T)) - Self::one() - }; - - - unsafe { Self::unpack_single_const_helper::(packed, mask) } - } + BitPackWidth: SupportedBitPackWidth; fn packed_lane_and_row() -> (usize, usize) { // We can think of the input array as effectively a row-major, left-to-right @@ -181,6 +142,9 @@ pub trait BitPacking: FastLanes { /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds). unsafe fn unpack_single_const_helper( packed: &[Self], mask: Self) -> Self + where + Pred< { START_BIT < Self::T * Self::T }> : Satisfied, + Pred< { LANE < Self::LANES }> : Satisfied { let start_word = START_BIT / Self::T; let lo_shift = START_BIT % Self::T; @@ -276,6 +240,73 @@ macro_rules! impl_packing { }) } + /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. + fn unpack_single2(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self + where + BitPackWidth: SupportedBitPackWidth, + { + // Special case for W=0, since there's only one possible value. + if W == 0 { + return 0 as $T; + } + + // We can think of the input array as effectively a row-major, left-to-right + // 2-D array of with `Self::LANES` columns and `Self::T` rows. + // + // Meanwhile, we can think of the packed array as either: + // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns + // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns + // + // Bitpacking involves a transposition of the input array ordering, such that + // decompression can be fused efficiently with encodings like delta and RLE. + // + // First step, we need to get the lane and row for interpretation #1 above. + let (lane, row): (usize, usize) = seq!(I in 0..1024 { + match index { + #(I => + Self::packed_lane_and_row::(), + )* + _ => unreachable!("Unsupported index: {}", index) + } + }); + + // From the row, we can get the correct start bit within the lane. + let start_bit = row * W; + let start_word = (start_bit) / Self::T; + let end_word = (start_bit + W - 1) / Self::T; + let one_word = (start_word == end_word); + + #[inline] + fn mask(width: usize) -> $T { + if width == $T::T { <$T>::MAX } else { (1 << (width % $T::T)) - 1 } + } + + seq_t!(W, ROW in $T, |W| { + match start_bit { + #(ROW * W => { + seq_lanes!(LANE in $T { + match lane { + #(LANE => { + match one_word { + true => unsafe { $T::unpack_single_const_helper::(packed, mask(W)) }, + false => unsafe { $T::unpack_single_const_helper::(packed, mask(W)) }, + } + })* + _ => unreachable!( + "Unsupported lane: {}", + lane + ) + } + }) + })* + _ => unreachable!( + "Unsupported start_bit: {}", + start_bit + ) + } + }) + } + unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self { let packed_len = 128 * width / size_of::(); debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len); @@ -304,10 +335,10 @@ macro_rules! impl_packing { }; } -impl_packing!(u8); +//impl_packing!(u8); impl_packing!(u16); -impl_packing!(u32); -impl_packing!(u64); +//impl_packing!(u32); +//impl_packing!(u64); #[cfg(test)] mod test { diff --git a/src/lib.rs b/src/lib.rs index 4bf858d..bf5f689 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,19 @@ macro_rules! seq_t { ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)}; ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)}; ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)}; + ($W:expr, $ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)}; + ($W:expr, $ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)}; + ($W:expr, $ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)}; + ($W:expr, $ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)}; +} + +// Macro for repeating a code block T::LANES times. +#[macro_export] +macro_rules! seq_lanes { + ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..128 $body)}; + ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)}; + ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)}; + ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)}; } #[cfg(test)] From 30fda6dc5af3315b3e5c786acb8a86fae46ea03e Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 4 Jul 2024 15:33:53 -0400 Subject: [PATCH 03/16] minimal improvement --- benches/bitpacking.rs | 18 +------- src/bitpacking.rs | 101 +++++------------------------------------- src/lib.rs | 13 ------ 3 files changed, 13 insertions(+), 119 deletions(-) diff --git a/benches/bitpacking.rs b/benches/bitpacking.rs index b1f425e..f29402d 100644 --- a/benches/bitpacking.rs +++ b/benches/bitpacking.rs @@ -46,7 +46,7 @@ fn pack(c: &mut Criterion) { { let mut group = c.benchmark_group("unpack-single"); - group.bench_function("unpack_single 16 <- 3", |b| { + group.bench_function("unpack single 16 <- 3", |b| { const WIDTH: usize = 3; let values = vec![3u16; 1024]; let mut packed = vec![0; 128 * WIDTH / size_of::()]; @@ -61,22 +61,6 @@ fn pack(c: &mut Criterion) { } }); }); - - group.bench_function("unpack_single2 16 <- 3", |b| { - const WIDTH: usize = 3; - let values = vec![3u16; 1024]; - let mut packed = vec![0; 128 * WIDTH / size_of::()]; - BitPacking::pack::(array_ref![values, 0, 1024], array_mut_ref![packed, 0, 192]); - - b.iter(|| { - for i in 0..1024 { - black_box::(BitPacking::unpack_single2::( - array_ref![packed, 0, 192], - i, - )); - } - }); - }); } } diff --git a/src/bitpacking.rs b/src/bitpacking.rs index f4ea35d..f97db6f 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -3,7 +3,7 @@ use core::mem::size_of; use paste::paste; use seq_macro::seq; -use crate::{pack, seq_t, seq_start_bits, seq_lanes, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; +use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; pub struct BitPackWidth; pub trait SupportedBitPackWidth {} @@ -64,19 +64,14 @@ pub trait BitPacking: FastLanes { // decompression can be fused efficiently with encodings like delta and RLE. // // First step, we need to get the lane and row for interpretation #1 above. - let lane = index % Self::LANES; - let row = { - // This is the inverse of the `index` function from the pack/unpack macros: - // fn index(row: usize, lane: usize) -> usize { - // let o = row / 8; - // let s = row % 8; - // (FL_ORDER[o] * 16) + (s * 128) + lane - // } - let s = index / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 - let fl_order = (index - s * 128 - lane) / 16; // value of FL_ORDER[o] - let o = FL_ORDER[fl_order]; // because this transposition is invertible! - o * 8 + s - }; + let (lane, row): (usize, usize) = seq!(I in 0..1024 { + match index { + #(I => + Self::packed_lane_and_row::(), + )* + _ => unreachable!("Unsupported index: {}", index) + } + }); // From the row, we can get the correct start bit within the lane. let start_bit = row * W; @@ -100,11 +95,6 @@ pub trait BitPacking: FastLanes { (lo | hi) & mask } - /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. - fn unpack_single2(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self - where - BitPackWidth: SupportedBitPackWidth; - fn packed_lane_and_row() -> (usize, usize) { // We can think of the input array as effectively a row-major, left-to-right // 2-D array of with `Self::LANES` columns and `Self::T` rows. @@ -240,73 +230,6 @@ macro_rules! impl_packing { }) } - /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. - fn unpack_single2(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self - where - BitPackWidth: SupportedBitPackWidth, - { - // Special case for W=0, since there's only one possible value. - if W == 0 { - return 0 as $T; - } - - // We can think of the input array as effectively a row-major, left-to-right - // 2-D array of with `Self::LANES` columns and `Self::T` rows. - // - // Meanwhile, we can think of the packed array as either: - // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns - // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns - // - // Bitpacking involves a transposition of the input array ordering, such that - // decompression can be fused efficiently with encodings like delta and RLE. - // - // First step, we need to get the lane and row for interpretation #1 above. - let (lane, row): (usize, usize) = seq!(I in 0..1024 { - match index { - #(I => - Self::packed_lane_and_row::(), - )* - _ => unreachable!("Unsupported index: {}", index) - } - }); - - // From the row, we can get the correct start bit within the lane. - let start_bit = row * W; - let start_word = (start_bit) / Self::T; - let end_word = (start_bit + W - 1) / Self::T; - let one_word = (start_word == end_word); - - #[inline] - fn mask(width: usize) -> $T { - if width == $T::T { <$T>::MAX } else { (1 << (width % $T::T)) - 1 } - } - - seq_t!(W, ROW in $T, |W| { - match start_bit { - #(ROW * W => { - seq_lanes!(LANE in $T { - match lane { - #(LANE => { - match one_word { - true => unsafe { $T::unpack_single_const_helper::(packed, mask(W)) }, - false => unsafe { $T::unpack_single_const_helper::(packed, mask(W)) }, - } - })* - _ => unreachable!( - "Unsupported lane: {}", - lane - ) - } - }) - })* - _ => unreachable!( - "Unsupported start_bit: {}", - start_bit - ) - } - }) - } - unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self { let packed_len = 128 * width / size_of::(); debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len); @@ -335,10 +258,10 @@ macro_rules! impl_packing { }; } -//impl_packing!(u8); +impl_packing!(u8); impl_packing!(u16); -//impl_packing!(u32); -//impl_packing!(u64); +impl_packing!(u32); +impl_packing!(u64); #[cfg(test)] mod test { diff --git a/src/lib.rs b/src/lib.rs index bf5f689..4bf858d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,19 +44,6 @@ macro_rules! seq_t { ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)}; ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)}; ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)}; - ($W:expr, $ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)}; - ($W:expr, $ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)}; - ($W:expr, $ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)}; - ($W:expr, $ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)}; -} - -// Macro for repeating a code block T::LANES times. -#[macro_export] -macro_rules! seq_lanes { - ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..128 $body)}; - ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)}; - ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)}; - ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)}; } #[cfg(test)] From e2471f495c877fee25e65610b3d0574f110937b9 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Mon, 15 Jul 2024 17:51:49 +0100 Subject: [PATCH 04/16] wip --- src/bitpacking.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index f97db6f..1e4c04d 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -95,6 +95,7 @@ pub trait BitPacking: FastLanes { (lo | hi) & mask } + #[must_use] fn packed_lane_and_row() -> (usize, usize) { // We can think of the input array as effectively a row-major, left-to-right // 2-D array of with `Self::LANES` columns and `Self::T` rows. @@ -123,27 +124,28 @@ pub trait BitPacking: FastLanes { (lane, row) } - /// Unpacks a single element at the provided LANE and START_BIT from a packed array of 1024 `W` bit elements, - /// where `W` is runtime-known instead of compile-time known. + /// Unpacks a single element at the provided `START_BIT` of the (runtime-specified) lane from a + /// packed array of 1024 `W` bit elements, where `W` is runtime-known (specified via the mask) + /// instead of compile-time known. The point of this function is to produce a reusable block of + /// code that balances compile-time optimization with code size. /// /// # Safety /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W` /// is the packed width. The output slice must be of exactly length 1024. /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds). - unsafe fn unpack_single_const_helper( - packed: &[Self], mask: Self) -> Self + unsafe fn unpack_single_const_helper( + packed: &[Self], lane: usize, mask: Self) -> Self where - Pred< { START_BIT < Self::T * Self::T }> : Satisfied, - Pred< { LANE < Self::LANES }> : Satisfied + Pred< { START_BIT < Self::T * Self::T }> : Satisfied { let start_word = START_BIT / Self::T; let lo_shift = START_BIT % Self::T; - let lo = packed[Self::LANES * start_word + LANE] >> lo_shift; + let lo = packed[Self::LANES * start_word + lane] >> lo_shift; if ONE_WORD { lo & mask } else { let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false - let hi = packed[Self::LANES * (start_word + 1) + LANE] << hi_shift; + let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift; (lo | hi) & mask } } From b536d27a1b42a0f440a8e45427b4084c99aa9d10 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Tue, 16 Jul 2024 12:50:07 +0100 Subject: [PATCH 05/16] wip --- src/bitpacking.rs | 111 ++++++++++++++++++++++++---------------------- src/lib.rs | 1 + 2 files changed, 60 insertions(+), 52 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index 1e4c04d..d2a7fab 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -46,54 +46,7 @@ pub trait BitPacking: FastLanes { /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. fn unpack_single(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self where - BitPackWidth: SupportedBitPackWidth, - { - // Special case for W=0, since there's only one possible value. - if W == 0 { - return Self::zero(); - } - - // We can think of the input array as effectively a row-major, left-to-right - // 2-D array of with `Self::LANES` columns and `Self::T` rows. - // - // Meanwhile, we can think of the packed array as either: - // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns - // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns - // - // Bitpacking involves a transposition of the input array ordering, such that - // decompression can be fused efficiently with encodings like delta and RLE. - // - // First step, we need to get the lane and row for interpretation #1 above. - let (lane, row): (usize, usize) = seq!(I in 0..1024 { - match index { - #(I => - Self::packed_lane_and_row::(), - )* - _ => unreachable!("Unsupported index: {}", index) - } - }); - - // From the row, we can get the correct start bit within the lane. - let start_bit = row * W; - - // We need to read one or two T-bit words from the lane, depending on how our - // target W-bit value overlaps with the T-bit words. To avoid a branch, we - // always read two T-bit words, and then shift/mask as needed. - let lo_word = start_bit / Self::T; - let lo_shift = start_bit % Self::T; - let lo = packed[Self::LANES * lo_word + lane] >> lo_shift; - - let hi_word = (start_bit + W - 1) / Self::T; - let hi_shift = (Self::T - lo_shift) % Self::T; - let hi = packed[Self::LANES * hi_word + lane] << hi_shift; - - let mask: Self = if W == Self::T { - Self::max_value() - } else { - ((Self::one()) << (W % Self::T)) - Self::one() - }; - (lo | hi) & mask - } + BitPackWidth: SupportedBitPackWidth; #[must_use] fn packed_lane_and_row() -> (usize, usize) { @@ -124,6 +77,15 @@ pub trait BitPacking: FastLanes { (lane, row) } + #[must_use] + fn mask() -> Self { + return if W == Self::T { + Self::max_value() + } else { + ((Self::one()) << (W % Self::T)) - Self::one() + }; + } + /// Unpacks a single element at the provided `START_BIT` of the (runtime-specified) lane from a /// packed array of 1024 `W` bit elements, where `W` is runtime-known (specified via the mask) /// instead of compile-time known. The point of this function is to produce a reusable block of @@ -136,7 +98,7 @@ pub trait BitPacking: FastLanes { unsafe fn unpack_single_const_helper( packed: &[Self], lane: usize, mask: Self) -> Self where - Pred< { START_BIT < Self::T * Self::T }> : Satisfied + Pred< { START_BIT < Self::BITS_PER_LANE }> : Satisfied { let start_word = START_BIT / Self::T; let lo_shift = START_BIT % Self::T; @@ -232,6 +194,51 @@ macro_rules! impl_packing { }) } + /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. + fn unpack_single(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self + where + BitPackWidth: SupportedBitPackWidth, + { + // Special case for W=0, since there's only one possible value. + if W == 0 { + return 0 as $T; + } + + // We can think of the input array as effectively a row-major, left-to-right + // 2-D array of with `Self::LANES` columns and `Self::T` rows. + // + // Meanwhile, we can think of the packed array as either: + // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns + // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns + // + // Bitpacking involves a transposition of the input array ordering, such that + // decompression can be fused efficiently with encodings like delta and RLE. + // + // First step, we need to get the lane and row for interpretation #1 above. + let (lane, row): (usize, usize) = seq!(I in 0..1024 { + match index { + #(I => + Self::packed_lane_and_row::(), + )* + _ => unreachable!("Unsupported index: {}", index) + } + }); + + let mask = Self::mask::(); + + seq_t!(ROW in $T { + match row { + #(ROW => { + const start_bit = ROW * W; + const remaining_bits: usize = $T::T - (start_bit % $T::T); + const one_word: bool = remaining_bits <= W; + return Self::unpack_single_const_helper::(packed, lane, mask); + },)* + _ => unreachable!("Unsupported row: {}", row) + } + }) + } + unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self { let packed_len = 128 * width / size_of::(); debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len); @@ -260,10 +267,10 @@ macro_rules! impl_packing { }; } -impl_packing!(u8); -impl_packing!(u16); +//impl_packing!(u8); +//impl_packing!(u16); impl_packing!(u32); -impl_packing!(u64); +// impl_packing!(u64); #[cfg(test)] mod test { diff --git a/src/lib.rs b/src/lib.rs index 4bf858d..fb18aaf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,6 +24,7 @@ pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7]; pub trait FastLanes: Sized + Unsigned + PrimInt { const T: usize = size_of::() * 8; const LANES: usize = 1024 / Self::T; + const BITS_PER_LANE: usize = Self::T * Self::T; } impl FastLanes for u8 {} From aecf42fcb25f899a1800ba9c64c8bb8b5ebbded4 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 17 Jul 2024 14:13:17 +0100 Subject: [PATCH 06/16] I hate macros --- src/bitpacking.rs | 107 ++-------------------------------------------- src/lib.rs | 1 - src/macros.rs | 81 +++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 104 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index d2a7fab..a06c267 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -1,9 +1,8 @@ use arrayref::{array_mut_ref, array_ref}; use core::mem::size_of; use paste::paste; -use seq_macro::seq; -use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; +use crate::{pack, seq_t, unpack, unpack_single, FastLanes, Pred, Satisfied}; pub struct BitPackWidth; pub trait SupportedBitPackWidth {} @@ -48,70 +47,6 @@ pub trait BitPacking: FastLanes { where BitPackWidth: SupportedBitPackWidth; - #[must_use] - fn packed_lane_and_row() -> (usize, usize) { - // We can think of the input array as effectively a row-major, left-to-right - // 2-D array of with `Self::LANES` columns and `Self::T` rows. - // - // Meanwhile, we can think of the packed array as either: - // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns - // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns - // - // Bitpacking involves a transposition of the input array ordering, such that - // decompression can be fused efficiently with encodings like delta and RLE. - // - // First step, we need to get the lane and row for interpretation #1 above. - let lane = INDEX % Self::LANES; - let row = { - // This is the inverse of the `index` function from the pack/unpack macros: - // fn index(row: usize, lane: usize) -> usize { - // let o = row / 8; - // let s = row % 8; - // (FL_ORDER[o] * 16) + (s * 128) + lane - // } - let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 - let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] - let o = FL_ORDER[fl_order]; // because this transposition is invertible! - o * 8 + s - }; - (lane, row) - } - - #[must_use] - fn mask() -> Self { - return if W == Self::T { - Self::max_value() - } else { - ((Self::one()) << (W % Self::T)) - Self::one() - }; - } - - /// Unpacks a single element at the provided `START_BIT` of the (runtime-specified) lane from a - /// packed array of 1024 `W` bit elements, where `W` is runtime-known (specified via the mask) - /// instead of compile-time known. The point of this function is to produce a reusable block of - /// code that balances compile-time optimization with code size. - /// - /// # Safety - /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W` - /// is the packed width. The output slice must be of exactly length 1024. - /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds). - unsafe fn unpack_single_const_helper( - packed: &[Self], lane: usize, mask: Self) -> Self - where - Pred< { START_BIT < Self::BITS_PER_LANE }> : Satisfied - { - let start_word = START_BIT / Self::T; - let lo_shift = START_BIT % Self::T; - let lo = packed[Self::LANES * start_word + lane] >> lo_shift; - if ONE_WORD { - lo & mask - } else { - let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false - let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift; - (lo | hi) & mask - } - } - /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements, /// where `W` is runtime-known instead of compile-time known. /// @@ -195,47 +130,13 @@ macro_rules! impl_packing { } /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. + #[inline(never)] fn unpack_single(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self where BitPackWidth: SupportedBitPackWidth, { - // Special case for W=0, since there's only one possible value. - if W == 0 { - return 0 as $T; - } - - // We can think of the input array as effectively a row-major, left-to-right - // 2-D array of with `Self::LANES` columns and `Self::T` rows. - // - // Meanwhile, we can think of the packed array as either: - // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns - // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns - // - // Bitpacking involves a transposition of the input array ordering, such that - // decompression can be fused efficiently with encodings like delta and RLE. - // - // First step, we need to get the lane and row for interpretation #1 above. - let (lane, row): (usize, usize) = seq!(I in 0..1024 { - match index { - #(I => - Self::packed_lane_and_row::(), - )* - _ => unreachable!("Unsupported index: {}", index) - } - }); - - let mask = Self::mask::(); - - seq_t!(ROW in $T { - match row { - #(ROW => { - const start_bit = ROW * W; - const remaining_bits: usize = $T::T - (start_bit % $T::T); - const one_word: bool = remaining_bits <= W; - return Self::unpack_single_const_helper::(packed, lane, mask); - },)* - _ => unreachable!("Unsupported row: {}", row) - } + unpack_single!($T, W, packed, index, |$elem| { + $elem }) } diff --git a/src/lib.rs b/src/lib.rs index fb18aaf..4bf858d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,6 @@ pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7]; pub trait FastLanes: Sized + Unsigned + PrimInt { const T: usize = size_of::() * 8; const LANES: usize = 1024 / Self::T; - const BITS_PER_LANE: usize = Self::T * Self::T; } impl FastLanes for u8 {} diff --git a/src/macros.rs b/src/macros.rs index e725e42..0be7072 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -173,6 +173,87 @@ macro_rules! unpack { }; } +#[macro_export] +macro_rules! unpack_single { + ($T:ty, $W:expr, $packed:expr, $index:expr, | $_1:tt $elem:ident | $($body:tt)*) => { + macro_rules! __kernel__ {( $_1 $elem:ident ) => ( $($body)* )} + { + use $crate::{seq_t, FL_ORDER, FastLanes}; + use paste::paste; + + // The number of bits of T. + const T: usize = <$T>::T; + + // This calculation of (lane, row) is the inverse of the `index` function from the + // pack/unpack macros + #[inline(always)] + const fn lane_and_row() -> (usize, usize) { + const lane: usize = INDEX % <$T>::LANES; + const row: usize = { + let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 + let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] + let o = FL_ORDER[fl_order]; // because this transposition is invertible! + o * 8 + s + }; + (lane, row) + } + + fn unpack_single_const_helper( + packed: &[$T], lane: usize, mask: Self) -> Self + where + Pred< { START_BIT < T * T }> : Satisfied + { + let start_word = START_BIT / Self::T; + let lo_shift = START_BIT % Self::T; + let lo = packed[Self::LANES * start_word + lane] >> lo_shift; + if ONE_WORD { + lo & mask + } else { + let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false + let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift; + (lo | hi) & mask + } + } + + if $W == 0 { + // Special case for W=0, we just need to zero the output. + // We'll still respect the iteration order in case the kernel has side effects. + let zero: $T = 0; + __kernel__!(zero); + } else { + let (lane, row): (usize, usize) = seq!(I in 0..1024 { + match index { + #(I => + lane_and_row::(), + )* + _ => unreachable!("Unsupported index: {}", index) + } + }); + + // Special case for W=T, we can just copy the packed value directly to the output. + if $W == T { + let val = $packed[<$T>::LANES * row + lane]; + __kernel__!(val); + } else { + const mask: usize = (1 << ($W % T)) - 1; + paste!(seq_t!(ROW in $T { + match row { + #(ROW => { + const START_BIT: usize = ROW * $W; + const REMAINING_BITS: usize = T - (START_BIT % T); + const ONE_WORD: bool = REMAINING_BITS <= $W; + let val = unpack_single_const_helper::($packed, lane, mask); + __kernel__!(val); + },)* + _ => unreachable!("Unsupported row: {}", row) + } + })) + } + } + } + }; +} + #[cfg(test)] mod test { use crate::{BitPacking, FastLanes}; From 3950913e59588acdaefd0f3559952ebe0fcdb578 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 17 Jul 2024 14:31:22 +0100 Subject: [PATCH 07/16] fuck kernels --- src/bitpacking.rs | 4 +- src/macros.rs | 136 ++++++++++++++++++++++------------------------ 2 files changed, 67 insertions(+), 73 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index a06c267..cf8419a 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -135,9 +135,7 @@ macro_rules! impl_packing { where BitPackWidth: SupportedBitPackWidth, { - unpack_single!($T, W, packed, index, |$elem| { - $elem - }) + unpack_single!($T, W, packed, index); } unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self { diff --git a/src/macros.rs b/src/macros.rs index 0be7072..5b34cf0 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -175,83 +175,79 @@ macro_rules! unpack { #[macro_export] macro_rules! unpack_single { - ($T:ty, $W:expr, $packed:expr, $index:expr, | $_1:tt $elem:ident | $($body:tt)*) => { - macro_rules! __kernel__ {( $_1 $elem:ident ) => ( $($body)* )} - { - use $crate::{seq_t, FL_ORDER, FastLanes}; - use paste::paste; - - // The number of bits of T. - const T: usize = <$T>::T; + ($T:ty, $W:expr, $packed:expr, $index:expr) => { + use $crate::{FastLanes, FL_ORDER, seq_t}; + use seq_macro::seq; + use paste::paste; + + // The number of bits of T. + const T: usize = <$T>::T; + + // This calculation of (lane, row) is the inverse of the `index` function from the + // pack/unpack macros + #[inline(always)] + fn lane_and_row() -> (usize, usize) + where Pred< { INDEX < 1024 }> : Satisfied { + const lane: usize = INDEX % <$T>::LANES; + const row: usize = { + let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 + let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] + let o = FL_ORDER[fl_order]; // because this transposition is invertible! + o * 8 + s + }; + (lane, row) + } - // This calculation of (lane, row) is the inverse of the `index` function from the - // pack/unpack macros - #[inline(always)] - const fn lane_and_row() -> (usize, usize) { - const lane: usize = INDEX % <$T>::LANES; - const row: usize = { - let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 - let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] - let o = FL_ORDER[fl_order]; // because this transposition is invertible! - o * 8 + s - }; - (lane, row) + fn unpack_single_const( + packed: &[$T], lane: usize, mask: $T) -> $T + where + Pred< { START_BIT < T * T }> : Satisfied + { + let start_word = START_BIT / T; + let lo_shift = START_BIT % T; + let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift; + if ONE_WORD { + lo & mask + } else { + let hi_shift = T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false + let hi = packed[<$T>::LANES * (start_word + 1) + lane] << hi_shift; + (lo | hi) & mask } + } - fn unpack_single_const_helper( - packed: &[$T], lane: usize, mask: Self) -> Self - where - Pred< { START_BIT < T * T }> : Satisfied - { - let start_word = START_BIT / Self::T; - let lo_shift = START_BIT % Self::T; - let lo = packed[Self::LANES * start_word + lane] >> lo_shift; - if ONE_WORD { - lo & mask - } else { - let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false - let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift; - (lo | hi) & mask + if $W == 0 { + // Special case for W=0, we just need to zero the output. + // We'll still respect the iteration order in case the kernel has side effects. + return 0 as $T; + } + + let (lane, row): (usize, usize) = seq!(I in 0..1024 { + match $index { + #(I => + lane_and_row::(), + )* + _ => unreachable!("Unsupported index: {}", $index) } - } + }); - if $W == 0 { - // Special case for W=0, we just need to zero the output. - // We'll still respect the iteration order in case the kernel has side effects. - let zero: $T = 0; - __kernel__!(zero); - } else { - let (lane, row): (usize, usize) = seq!(I in 0..1024 { - match index { - #(I => - lane_and_row::(), - )* - _ => unreachable!("Unsupported index: {}", index) - } - }); + // Special case for W=T, we can just copy the packed value directly to the output. + if $W == T { + return $packed[<$T>::LANES * row + lane]; + } - // Special case for W=T, we can just copy the packed value directly to the output. - if $W == T { - let val = $packed[<$T>::LANES * row + lane]; - __kernel__!(val); - } else { - const mask: usize = (1 << ($W % T)) - 1; - paste!(seq_t!(ROW in $T { - match row { - #(ROW => { - const START_BIT: usize = ROW * $W; - const REMAINING_BITS: usize = T - (START_BIT % T); - const ONE_WORD: bool = REMAINING_BITS <= $W; - let val = unpack_single_const_helper::($packed, lane, mask); - __kernel__!(val); - },)* - _ => unreachable!("Unsupported row: {}", row) - } - })) - } + const mask: $T = (1 << ($W % T)) - 1; + paste!(seq_t!(ROW in $T { + match row { + #(ROW => { + const START_BIT: usize = ROW * $W; + const REMAINING_BITS: usize = T - (START_BIT % T); + const ONE_WORD: bool = REMAINING_BITS <= $W; + return unpack_single_const::<{START_BIT}, {ONE_WORD}>($packed, lane, mask); + },)* + _ => unreachable!("Unsupported row: {}", row) } - } - }; + })) + } } #[cfg(test)] From 9ea034e24d8d3e0c29a6332be6acc662f8dae9ad Mon Sep 17 00:00:00 2001 From: Will Manning Date: Wed, 17 Jul 2024 14:53:58 +0100 Subject: [PATCH 08/16] this is dumb and I hate it --- src/macros.rs | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src/macros.rs b/src/macros.rs index 5b34cf0..b3d8376 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -183,22 +183,7 @@ macro_rules! unpack_single { // The number of bits of T. const T: usize = <$T>::T; - // This calculation of (lane, row) is the inverse of the `index` function from the - // pack/unpack macros - #[inline(always)] - fn lane_and_row() -> (usize, usize) - where Pred< { INDEX < 1024 }> : Satisfied { - const lane: usize = INDEX % <$T>::LANES; - const row: usize = { - let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 - let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] - let o = FL_ORDER[fl_order]; // because this transposition is invertible! - o * 8 + s - }; - (lane, row) - } - - fn unpack_single_const( + fn unpack_single_helper( packed: &[$T], lane: usize, mask: $T) -> $T where Pred< { START_BIT < T * T }> : Satisfied @@ -221,11 +206,20 @@ macro_rules! unpack_single { return 0 as $T; } - let (lane, row): (usize, usize) = seq!(I in 0..1024 { + let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 { match $index { - #(I => - lane_and_row::(), - )* + #(INDEX => { + // This calculation of (lane, row) is the inverse of the `index` function from the + // pack/unpack macros + const lane: usize = INDEX % <$T>::LANES; + const row: usize = { + let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 + let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] + let o = FL_ORDER[fl_order]; // because this transposition is invertible! + o * 8 + s + }; + (lane, row) + })* _ => unreachable!("Unsupported index: {}", $index) } }); @@ -242,7 +236,7 @@ macro_rules! unpack_single { const START_BIT: usize = ROW * $W; const REMAINING_BITS: usize = T - (START_BIT % T); const ONE_WORD: bool = REMAINING_BITS <= $W; - return unpack_single_const::<{START_BIT}, {ONE_WORD}>($packed, lane, mask); + return unpack_single_helper::<{START_BIT}, {ONE_WORD}>($packed, lane, mask); },)* _ => unreachable!("Unsupported row: {}", row) } From 8a1ce5b16f130537170e34049fdff55814131571 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 18 Jul 2024 13:48:35 +0100 Subject: [PATCH 09/16] everything const, compiles and runs --- src/bitpacking.rs | 82 ++++++++++++++++++++++++++++++++++++----------- src/macros.rs | 74 ++++++++++++++---------------------------- 2 files changed, 88 insertions(+), 68 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index cf8419a..0d050f4 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -1,8 +1,9 @@ use arrayref::{array_mut_ref, array_ref}; use core::mem::size_of; use paste::paste; +use seq_macro::seq; -use crate::{pack, seq_t, unpack, unpack_single, FastLanes, Pred, Satisfied}; +use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; pub struct BitPackWidth; pub trait SupportedBitPackWidth {} @@ -130,33 +131,78 @@ macro_rules! impl_packing { } /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements. - #[inline(never)] fn unpack_single(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self where BitPackWidth: SupportedBitPackWidth, { - unpack_single!($T, W, packed, index); + unsafe { + Self::unchecked_unpack_single(W, packed, index) + } } - unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self { + #[allow(arithmetic_overflow, unused_comparisons)] + unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self { let packed_len = 128 * width / size_of::(); - debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len); + debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len); debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T); - debug_assert!(index <= 1024, "index must be less than or equal to 1024"); + debug_assert!(index < 1024, "index must be less than or equal to 1024"); + + let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 { + match index { + #(INDEX => { + // This calculation of (lane, row) is the inverse of the `index` function from the + // pack/unpack macros + const LANE: usize = INDEX % <$T>::LANES; + const ROW: usize = { + let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 + let fl_order = (INDEX - s * 128 - LANE) / 16; // value of FL_ORDER[o] + let o = FL_ORDER[fl_order]; // because this transposition is invertible! + o * 8 + s + }; + (LANE, ROW) + })* + _ => unreachable!("Unsupported index: {}", index) + } + }); seq_t!(W in $T { match width { #(W => { - Self::unpack_single::( - array_ref![input, 0, 1024 * W / <$T>::T], - index - ) - })* + if W == 0 { + // Special case for W=0, we just need to zero the output. + return 0 as $T; + } + seq_t!(ROW in $T { + match row { + #(ROW => { + const MASK: $T = (1 << (W % <$T>::T)) - 1; + const START_BIT: usize = ROW * W; + + const START_WORD: usize = START_BIT / <$T>::T; + // bits to shift out of lo word + const LO_SHIFT: usize = START_BIT % <$T>::T; + // remaining bits in the lo word == bits to shift from hi word + const REMAINING_BITS: usize = <$T>::T - LO_SHIFT; + + let lo = packed[<$T>::LANES * START_WORD + lane] >> LO_SHIFT; + return if REMAINING_BITS >= W { + // in this case we will mask out all bits of hi word + lo & MASK + } else { + // guaranteed that lo_shift > 0 and thus remaining_bits < T + let hi = packed[<$T>::LANES * (START_WORD + 1) + lane] << REMAINING_BITS; + (lo | hi) & MASK + } + },)* + _ => unreachable!("Unsupported row: {}", row) + } + }) + },)* // seq_t has exclusive upper bound - Self::T => Self::unpack_single::<{ Self::T }>( - array_ref![input, 0, 1024], - index - ), + Self::T => { + // Special case for W=T, we can just read the value directly + return packed[<$T>::LANES * row + lane]; + }, _ => unreachable!("Unsupported width: {}", width) } }) @@ -166,10 +212,10 @@ macro_rules! impl_packing { }; } -//impl_packing!(u8); -//impl_packing!(u16); +impl_packing!(u8); +impl_packing!(u16); impl_packing!(u32); -// impl_packing!(u64); +impl_packing!(u64); #[cfg(test)] mod test { diff --git a/src/macros.rs b/src/macros.rs index b3d8376..5bf02c4 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -175,73 +175,47 @@ macro_rules! unpack { #[macro_export] macro_rules! unpack_single { - ($T:ty, $W:expr, $packed:expr, $index:expr) => { - use $crate::{FastLanes, FL_ORDER, seq_t}; - use seq_macro::seq; + // $W must be constant / compile-time known + ($T:ty, $W:expr, $packed:expr, $row:expr, $lane:expr) => {{ + use $crate::{FastLanes, seq_t}; use paste::paste; // The number of bits of T. const T: usize = <$T>::T; - fn unpack_single_helper( - packed: &[$T], lane: usize, mask: $T) -> $T - where - Pred< { START_BIT < T * T }> : Satisfied - { - let start_word = START_BIT / T; - let lo_shift = START_BIT % T; - let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift; - if ONE_WORD { - lo & mask - } else { - let hi_shift = T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false - let hi = packed[<$T>::LANES * (start_word + 1) + lane] << hi_shift; - (lo | hi) & mask - } - } - if $W == 0 { // Special case for W=0, we just need to zero the output. - // We'll still respect the iteration order in case the kernel has side effects. return 0 as $T; + } else if $W == T { + return $packed[<$T>::LANES * $row + $lane]; } - let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 { - match $index { - #(INDEX => { - // This calculation of (lane, row) is the inverse of the `index` function from the - // pack/unpack macros - const lane: usize = INDEX % <$T>::LANES; - const row: usize = { - let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 - let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o] - let o = FL_ORDER[fl_order]; // because this transposition is invertible! - o * 8 + s - }; - (lane, row) - })* - _ => unreachable!("Unsupported index: {}", $index) - } - }); - - // Special case for W=T, we can just copy the packed value directly to the output. - if $W == T { - return $packed[<$T>::LANES * row + lane]; - } - - const mask: $T = (1 << ($W % T)) - 1; paste!(seq_t!(ROW in $T { - match row { + match $row { #(ROW => { + const MASK: $T = (1 << ($W % T)) - 1; const START_BIT: usize = ROW * $W; - const REMAINING_BITS: usize = T - (START_BIT % T); - const ONE_WORD: bool = REMAINING_BITS <= $W; - return unpack_single_helper::<{START_BIT}, {ONE_WORD}>($packed, lane, mask); + + const START_WORD: usize = START_BIT / T; + // bits to shift out of lo word + const LO_SHIFT: usize = START_BIT % T; + // remaining bits in the lo word == bits to shift from hi word + const REMAINING_BITS: usize = T - LO_SHIFT; + + let lo = packed[<$T>::LANES * START_WORD + $lane] >> LO_SHIFT; + return if REMAINING_BITS >= W { + // in this case we will mask out all bits of hi word + lo & MASK + } else { + // guaranteed that lo_shift > 0 and thus remaining_bits < T + let hi = packed[<$T>::LANES * (START_WORD + 1) + $lane] << REMAINING_BITS; + (lo | hi) & MASK + } },)* _ => unreachable!("Unsupported row: {}", row) } })) - } + }} } #[cfg(test)] From 7432bcff76dd9849f05f4d79eeae781a42120839 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 18 Jul 2024 14:22:07 +0100 Subject: [PATCH 10/16] faster smaller code --- src/bitpacking.rs | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index 0d050f4..f7b60e2 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -172,31 +172,22 @@ macro_rules! impl_packing { // Special case for W=0, we just need to zero the output. return 0 as $T; } - seq_t!(ROW in $T { - match row { - #(ROW => { - const MASK: $T = (1 << (W % <$T>::T)) - 1; - const START_BIT: usize = ROW * W; - - const START_WORD: usize = START_BIT / <$T>::T; - // bits to shift out of lo word - const LO_SHIFT: usize = START_BIT % <$T>::T; - // remaining bits in the lo word == bits to shift from hi word - const REMAINING_BITS: usize = <$T>::T - LO_SHIFT; - - let lo = packed[<$T>::LANES * START_WORD + lane] >> LO_SHIFT; - return if REMAINING_BITS >= W { - // in this case we will mask out all bits of hi word - lo & MASK - } else { - // guaranteed that lo_shift > 0 and thus remaining_bits < T - let hi = packed[<$T>::LANES * (START_WORD + 1) + lane] << REMAINING_BITS; - (lo | hi) & MASK - } - },)* - _ => unreachable!("Unsupported row: {}", row) - } - }) + + const MASK: $T = (1 << (W % <$T>::T)) - 1; + let start_bit = row * W; + let start_word = start_bit / <$T>::T; + let lo_shift = start_bit % <$T>::T; + let remaining_bits = <$T>::T - lo_shift; + + let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift; + return if remaining_bits >= W { + // in this case we will mask out all bits of hi word + lo & MASK + } else { + // guaranteed that lo_shift > 0 and thus remaining_bits < T + let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits; + (lo | hi) & MASK + } },)* // seq_t has exclusive upper bound Self::T => { From 6cd5629257af6da14265f0608b84c7f71d227991 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 18 Jul 2024 14:40:25 +0100 Subject: [PATCH 11/16] faster, less const magic --- src/bitpacking.rs | 69 ++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index f7b60e2..935f469 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -135,17 +135,10 @@ macro_rules! impl_packing { where BitPackWidth: SupportedBitPackWidth, { - unsafe { - Self::unchecked_unpack_single(W, packed, index) + if W == 0 { + // Special case for W=0, we just need to zero the output. + return 0 as $T; } - } - - #[allow(arithmetic_overflow, unused_comparisons)] - unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self { - let packed_len = 128 * width / size_of::(); - debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len); - debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T); - debug_assert!(index < 1024, "index must be less than or equal to 1024"); let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 { match index { @@ -165,34 +158,44 @@ macro_rules! impl_packing { } }); + if W == Self::T { + // Special case for W==T, we can just read the value directly + return packed[<$T>::LANES * row + lane]; + } + + let mask: $T = (1 << (W % <$T>::T)) - 1; + let start_bit = row * W; + let start_word = start_bit / <$T>::T; + let lo_shift = start_bit % <$T>::T; + let remaining_bits = <$T>::T - lo_shift; + + let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift; + return if remaining_bits >= W { + // in this case we will mask out all bits of hi word + lo & mask + } else { + // guaranteed that lo_shift > 0 and thus remaining_bits < T + let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits; + (lo | hi) & mask + } + } + + unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self { + let packed_len = 128 * width / size_of::(); + debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len); + debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T); + debug_assert!(index < 1024, "index must be less than or equal to 1024"); + + const T: usize = <$T>::T; + seq_t!(W in $T { match width { #(W => { - if W == 0 { - // Special case for W=0, we just need to zero the output. - return 0 as $T; - } - - const MASK: $T = (1 << (W % <$T>::T)) - 1; - let start_bit = row * W; - let start_word = start_bit / <$T>::T; - let lo_shift = start_bit % <$T>::T; - let remaining_bits = <$T>::T - lo_shift; - - let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift; - return if remaining_bits >= W { - // in this case we will mask out all bits of hi word - lo & MASK - } else { - // guaranteed that lo_shift > 0 and thus remaining_bits < T - let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits; - (lo | hi) & MASK - } + return <$T>::unpack_single::(array_ref![packed, 0, 1024 * W / T], index); },)* // seq_t has exclusive upper bound - Self::T => { - // Special case for W=T, we can just read the value directly - return packed[<$T>::LANES * row + lane]; + T => { + return <$T>::unpack_single::(array_ref![packed, 0, 1024], index); }, _ => unreachable!("Unsupported width: {}", width) } From d46e9f0925d4bd1d68b5f8770cdd73a1422fd711 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 19 Jul 2024 13:31:29 +0100 Subject: [PATCH 12/16] sub nanosecond, woot --- Cargo.toml | 1 + src/bitpacking.rs | 53 +++++++++++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5e8472b..01a16b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ edition = "2021" [dependencies] arrayref = "0.3.7" +const_for = "0.1.4" num-traits = "0.2.19" paste = "1.0.15" seq-macro = "0.3.5" diff --git a/src/bitpacking.rs b/src/bitpacking.rs index 935f469..73972d3 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -1,7 +1,7 @@ use arrayref::{array_mut_ref, array_ref}; use core::mem::size_of; use paste::paste; -use seq_macro::seq; +use const_for::const_for; use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; @@ -140,25 +140,14 @@ macro_rules! impl_packing { return 0 as $T; } - let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 { - match index { - #(INDEX => { - // This calculation of (lane, row) is the inverse of the `index` function from the - // pack/unpack macros - const LANE: usize = INDEX % <$T>::LANES; - const ROW: usize = { - let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 - let fl_order = (INDEX - s * 128 - LANE) / 16; // value of FL_ORDER[o] - let o = FL_ORDER[fl_order]; // because this transposition is invertible! - o * 8 + s - }; - (LANE, ROW) - })* - _ => unreachable!("Unsupported index: {}", index) - } - }); + assert!(index < 1024, "Index must be less than 1024, got {}", index); + let (lane, row): (usize, usize) = { + const LANES: [u8; 1024] = lanes_by_index::<$T>(); + const ROWS: [u8; 1024] = rows_by_index::<$T>(); + (LANES[index] as usize, ROWS[index] as usize) + }; - if W == Self::T { + if W == <$T>::T { // Special case for W==T, we can just read the value directly return packed[<$T>::LANES * row + lane]; } @@ -177,17 +166,17 @@ macro_rules! impl_packing { // guaranteed that lo_shift > 0 and thus remaining_bits < T let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits; (lo | hi) & mask - } + }; } unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self { + const T: usize = <$T>::T; + let packed_len = 128 * width / size_of::(); debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len); debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T); debug_assert!(index < 1024, "index must be less than or equal to 1024"); - const T: usize = <$T>::T; - seq_t!(W in $T { match width { #(W => { @@ -206,6 +195,26 @@ macro_rules! impl_packing { }; } +const fn lanes_by_index() -> [u8; 1024] { + let mut lanes = [0u8; 1024]; + const_for!(i in 0..1024 => { + lanes[i] = (i % T::LANES) as u8; + }); + lanes +} + +const fn rows_by_index() -> [u8; 1024] { + let mut rows = [0u8; 1024]; + const_for!(i in 0..1024 => { + let lane = i % T::LANES; + let s = i / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 + let fl_order = (i - s * 128 - lane) / 16; // value of FL_ORDER[o] + let o = FL_ORDER[fl_order]; // because this transposition is invertible! + rows[i] = (o * 8 + s) as u8; + }); + rows +} + impl_packing!(u8); impl_packing!(u16); impl_packing!(u32); From d63a0e7ac8f848b6b374740d728847602c5254bc Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 19 Jul 2024 14:05:02 +0100 Subject: [PATCH 13/16] remove vestigial --- src/macros.rs | 45 --------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/src/macros.rs b/src/macros.rs index 5bf02c4..e725e42 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -173,51 +173,6 @@ macro_rules! unpack { }; } -#[macro_export] -macro_rules! unpack_single { - // $W must be constant / compile-time known - ($T:ty, $W:expr, $packed:expr, $row:expr, $lane:expr) => {{ - use $crate::{FastLanes, seq_t}; - use paste::paste; - - // The number of bits of T. - const T: usize = <$T>::T; - - if $W == 0 { - // Special case for W=0, we just need to zero the output. - return 0 as $T; - } else if $W == T { - return $packed[<$T>::LANES * $row + $lane]; - } - - paste!(seq_t!(ROW in $T { - match $row { - #(ROW => { - const MASK: $T = (1 << ($W % T)) - 1; - const START_BIT: usize = ROW * $W; - - const START_WORD: usize = START_BIT / T; - // bits to shift out of lo word - const LO_SHIFT: usize = START_BIT % T; - // remaining bits in the lo word == bits to shift from hi word - const REMAINING_BITS: usize = T - LO_SHIFT; - - let lo = packed[<$T>::LANES * START_WORD + $lane] >> LO_SHIFT; - return if REMAINING_BITS >= W { - // in this case we will mask out all bits of hi word - lo & MASK - } else { - // guaranteed that lo_shift > 0 and thus remaining_bits < T - let hi = packed[<$T>::LANES * (START_WORD + 1) + $lane] << REMAINING_BITS; - (lo | hi) & MASK - } - },)* - _ => unreachable!("Unsupported row: {}", row) - } - })) - }} -} - #[cfg(test)] mod test { use crate::{BitPacking, FastLanes}; From b3d54962b0cc81edb20178a1079e62a69d003b24 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 19 Jul 2024 14:07:33 +0100 Subject: [PATCH 14/16] comment --- src/bitpacking.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index 73972d3..f382d3b 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -140,6 +140,17 @@ macro_rules! impl_packing { return 0 as $T; } + // We can think of the input array as effectively a row-major, left-to-right + // 2-D array of with `Self::LANES` columns and `Self::T` rows. + // + // Meanwhile, we can think of the packed array as either: + // 1. `Self::T` rows of W-bit elements, with `Self::LANES` columns + // 2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns + // + // Bitpacking involves a transposition of the input array ordering, such that + // decompression can be fused efficiently with encodings like delta and RLE. + // + // First step, we need to get the lane and row for interpretation #1 above. assert!(index < 1024, "Index must be less than 1024, got {}", index); let (lane, row): (usize, usize) = { const LANES: [u8; 1024] = lanes_by_index::<$T>(); @@ -195,6 +206,7 @@ macro_rules! impl_packing { }; } +// helper function executed at compile-time to speed up unpack_single at runtime const fn lanes_by_index() -> [u8; 1024] { let mut lanes = [0u8; 1024]; const_for!(i in 0..1024 => { @@ -203,9 +215,16 @@ const fn lanes_by_index() -> [u8; 1024] { lanes } +// helper function executed at compile-time to speed up unpack_single at runtime const fn rows_by_index() -> [u8; 1024] { let mut rows = [0u8; 1024]; const_for!(i in 0..1024 => { + // This is the inverse of the `index` function from the pack/unpack macros: + // fn index(row: usize, lane: usize) -> usize { + // let o = row / 8; + // let s = row % 8; + // (FL_ORDER[o] * 16) + (s * 128) + lane + // } let lane = i % T::LANES; let s = i / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128 let fl_order = (i - s * 128 - lane) / 16; // value of FL_ORDER[o] From 9363cdbe93e7ad47715b77300d28c4ea98a2450a Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 19 Jul 2024 14:20:58 +0100 Subject: [PATCH 15/16] remove unnecessary assert --- src/bitpacking.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index f382d3b..5dfe9ab 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -1,7 +1,7 @@ use arrayref::{array_mut_ref, array_ref}; +use const_for::const_for; use core::mem::size_of; use paste::paste; -use const_for::const_for; use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER}; @@ -186,7 +186,6 @@ macro_rules! impl_packing { let packed_len = 128 * width / size_of::(); debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len); debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T); - debug_assert!(index < 1024, "index must be less than or equal to 1024"); seq_t!(W in $T { match width { From f819899eae1e046a77fab3c91d03f6e9ca179b21 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Fri, 19 Jul 2024 15:00:18 +0100 Subject: [PATCH 16/16] Remove inline never --- src/bitpacking.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/bitpacking.rs b/src/bitpacking.rs index 5dfe9ab..8d0ca10 100644 --- a/src/bitpacking.rs +++ b/src/bitpacking.rs @@ -62,7 +62,6 @@ macro_rules! impl_packing { ($T:ty) => { paste! { impl BitPacking for $T { - #[inline(never)] // Makes it easier to disassemble and validate ASM. fn pack( input: &[Self; 1024], output: &mut [Self; 1024 * W / Self::T], @@ -96,7 +95,6 @@ macro_rules! impl_packing { }) } - #[inline(never)] fn unpack( input: &[Self; 1024 * W / Self::T], output: &mut [Self; 1024],