From f3a39c670a00ef2ef22d19c9b023fd53996197fe Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Wed, 3 Jul 2024 12:02:58 -0400
Subject: [PATCH 01/16] wip on gcd const

---
 benches/bitpacking.rs | 18 +++++++-
 src/bitpacking.rs     | 95 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)
diff --git a/benches/bitpacking.rs b/benches/bitpacking.rs
index f29402d..b1f425e 100644
--- a/benches/bitpacking.rs
+++ b/benches/bitpacking.rs
@@ -46,7 +46,7 @@ fn pack(c: &mut Criterion) {
 
     {
         let mut group = c.benchmark_group("unpack-single");
-        group.bench_function("unpack single 16 <- 3", |b| {
+        group.bench_function("unpack_single 16 <- 3", |b| {
             const WIDTH: usize = 3;
             let values = vec![3u16; 1024];
             let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];
@@ -61,6 +61,22 @@ fn pack(c: &mut Criterion) {
                 }
             });
         });
+
+        group.bench_function("unpack_single2 16 <- 3", |b| {
+            const WIDTH: usize = 3;
+            let values = vec![3u16; 1024];
+            let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];
+            BitPacking::pack::<WIDTH>(array_ref![values, 0, 1024], array_mut_ref![packed, 0, 192]);
+
+            b.iter(|| {
+                for i in 0..1024 {
+                    black_box::<u16>(BitPacking::unpack_single2::<WIDTH>(
+                        array_ref![packed, 0, 192],
+                        i,
+                    ));
+                }
+            });
+        });
     }
 }
 
diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 89c2e32..5ebad92 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -99,6 +99,101 @@ pub trait BitPacking: FastLanes {
         (lo | hi) & mask
     }
 
+    /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
+    fn unpack_single2<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
+    where
+        BitPackWidth<W>: SupportedBitPackWidth<Self>,
+    {
+        // Special case for W=0, since there's only one possible value.
+        if W == 0 {
+            return Self::zero();
+        }
+
+        // We can think of the input array as effectively a row-major, left-to-right
+        // 2-D array of with `Self::LANES` columns and `Self::T` rows.
+        //
+        // Meanwhile, we can think of the packed array as either:
+        //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
+        //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
+        //
+        // Bitpacking involves a transposition of the input array ordering, such that
+        // decompression can be fused efficiently with encodings like delta and RLE.
+        //
+        // First step, we need to get the lane and row for interpretation #1 above.
+        let (lane, row): (usize, usize) = seq!(I in 0..1024 {
+            match index {
+                #(I =>
+                    Self::packed_lane_and_row::<I>(),
+                )*
+                _ => unreachable!("Unsupported index: {}", index)
+            }
+        });
+
+        // From the row, we can get the correct start bit within the lane.
+        let start_bit = row * W;
+        let start_word = (start_bit) / Self::T;
+        let end_word = (start_bit + W - 1) / Self::T;
+        let one_word = (start_word == end_word);
+        let mask: Self = if W == Self::T {
+            Self::max_value()
+        } else {
+            ((Self::one()) << (W % Self::T)) - Self::one()
+        };
+
+
+        unsafe { Self::unpack_single_const_helper::<lane, start_bit, one_word>(packed, mask) }
+    }
+
+    fn packed_lane_and_row<const INDEX: usize>() -> (usize, usize) {
+        // We can think of the input array as effectively a row-major, left-to-right
+        // 2-D array of with `Self::LANES` columns and `Self::T` rows.
+        //
+        // Meanwhile, we can think of the packed array as either:
+        //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
+        //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
+        //
+        // Bitpacking involves a transposition of the input array ordering, such that
+        // decompression can be fused efficiently with encodings like delta and RLE.
+        //
+        // First step, we need to get the lane and row for interpretation #1 above.
+        let lane = INDEX % Self::LANES;
+        let row = {
+            // This is the inverse of the `index` function from the pack/unpack macros:
+            //     fn index(row: usize, lane: usize) -> usize {
+            //         let o = row / 8;
+            //         let s = row % 8;
+            //         (FL_ORDER[o] * 16) + (s * 128) + lane
+            //     }
+            let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+            let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
+            let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+            o * 8 + s
+        };
+        (lane, row)
+    }
+
+    /// Unpacks a single element at the provided LANE and START_BIT from a packed array of 1024 `W` bit elements,
+    /// where `W` is runtime-known instead of compile-time known.
+    ///
+    /// # Safety
+    /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W`
+    /// is the packed width. The output slice must be of exactly length 1024.
+    /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
+    unsafe fn unpack_single_const_helper<const LANE: usize, const START_BIT: usize, const ONE_WORD: bool>(
+        packed: &[Self], mask: Self) -> Self
+    {
+        let start_word = START_BIT / Self::T;
+        let lo_shift = START_BIT % Self::T;
+        let lo = packed[Self::LANES * start_word + LANE] >> lo_shift;
+        if ONE_WORD {
+            lo & mask
+        } else {
+            let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false
+            let hi = packed[Self::LANES * (start_word + 1) + LANE] << hi_shift;
+            (lo | hi) & mask
+        }
+    }
+
     /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements,
     /// where `W` is runtime-known instead of compile-time known.
     ///

From aac0ddc65358f1ed9fa4d526574db747aca480f6 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Thu, 4 Jul 2024 15:31:21 -0400
Subject: [PATCH 02/16] wip

---
 src/bitpacking.rs | 121 +++++++++++++++++++++++++++++-----------------
 src/lib.rs        |  13 +++++
 2 files changed, 89 insertions(+), 45 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 5ebad92..f4ea35d 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -1,8 +1,9 @@
 use arrayref::{array_mut_ref, array_ref};
 use core::mem::size_of;
 use paste::paste;
+use seq_macro::seq;
 
-use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
+use crate::{pack, seq_t, seq_start_bits, seq_lanes, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
 
 pub struct BitPackWidth<const W: usize>;
 pub trait SupportedBitPackWidth<T> {}
@@ -102,47 +103,7 @@ pub trait BitPacking: FastLanes {
     /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
     fn unpack_single2<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
     where
-        BitPackWidth<W>: SupportedBitPackWidth<Self>,
-    {
-        // Special case for W=0, since there's only one possible value.
-        if W == 0 {
-            return Self::zero();
-        }
-
-        // We can think of the input array as effectively a row-major, left-to-right
-        // 2-D array of with `Self::LANES` columns and `Self::T` rows.
-        //
-        // Meanwhile, we can think of the packed array as either:
-        //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
-        //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
-        //
-        // Bitpacking involves a transposition of the input array ordering, such that
-        // decompression can be fused efficiently with encodings like delta and RLE.
-        //
-        // First step, we need to get the lane and row for interpretation #1 above.
-        let (lane, row): (usize, usize) = seq!(I in 0..1024 {
-            match index {
-                #(I =>
-                    Self::packed_lane_and_row::<I>(),
-                )*
-                _ => unreachable!("Unsupported index: {}", index)
-            }
-        });
-
-        // From the row, we can get the correct start bit within the lane.
-        let start_bit = row * W;
-        let start_word = (start_bit) / Self::T;
-        let end_word = (start_bit + W - 1) / Self::T;
-        let one_word = (start_word == end_word);
-        let mask: Self = if W == Self::T {
-            Self::max_value()
-        } else {
-            ((Self::one()) << (W % Self::T)) - Self::one()
-        };
-
-
-        unsafe { Self::unpack_single_const_helper::<lane, start_bit, one_word>(packed, mask) }
-    }
+        BitPackWidth<W>: SupportedBitPackWidth<Self>;
 
     fn packed_lane_and_row<const INDEX: usize>() -> (usize, usize) {
         // We can think of the input array as effectively a row-major, left-to-right
@@ -181,6 +142,9 @@ pub trait BitPacking: FastLanes {
     /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
     unsafe fn unpack_single_const_helper<const LANE: usize, const START_BIT: usize, const ONE_WORD: bool>(
         packed: &[Self], mask: Self) -> Self
+    where
+        Pred< { START_BIT < Self::T * Self::T }> : Satisfied,
+        Pred< { LANE < Self::LANES }> : Satisfied
     {
         let start_word = START_BIT / Self::T;
         let lo_shift = START_BIT % Self::T;
@@ -276,6 +240,73 @@ macro_rules! impl_packing {
                     })
                 }
 
+                /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
+                fn unpack_single2<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
+                where
+                    BitPackWidth<W>: SupportedBitPackWidth<Self>,
+                {
+                    // Special case for W=0, since there's only one possible value.
+                    if W == 0 {
+                        return 0 as $T;
+                    }
+
+                    // We can think of the input array as effectively a row-major, left-to-right
+                    // 2-D array of with `Self::LANES` columns and `Self::T` rows.
+                    //
+                    // Meanwhile, we can think of the packed array as either:
+                    //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
+                    //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
+                    //
+                    // Bitpacking involves a transposition of the input array ordering, such that
+                    // decompression can be fused efficiently with encodings like delta and RLE.
+                    //
+                    // First step, we need to get the lane and row for interpretation #1 above.
+                    let (lane, row): (usize, usize) = seq!(I in 0..1024 {
+                        match index {
+                            #(I =>
+                                Self::packed_lane_and_row::<I>(),
+                            )*
+                            _ => unreachable!("Unsupported index: {}", index)
+                        }
+                    });
+
+                    // From the row, we can get the correct start bit within the lane.
+                    let start_bit = row * W;
+                    let start_word = (start_bit) / Self::T;
+                    let end_word = (start_bit + W - 1) / Self::T;
+                    let one_word = (start_word == end_word);
+
+                    #[inline]
+                    fn mask(width: usize) -> $T {
+                        if width == $T::T { <$T>::MAX } else { (1 << (width % $T::T)) - 1 }
+                    }
+                    
+                    seq_t!(W, ROW in $T, |W| {
+                        match start_bit {
+                            #(ROW * W => {
+                                seq_lanes!(LANE in $T {
+                                    match lane {
+                                        #(LANE => {
+                                            match one_word {
+                                                true => unsafe { $T::unpack_single_const_helper::<LANE, START_BIT, {true}>(packed, mask(W)) },
+                                                false => unsafe { $T::unpack_single_const_helper::<LANE, START_BIT, {false}>(packed, mask(W)) },
+                                            }
+                                        })*
+                                        _ => unreachable!(
+                                            "Unsupported lane: {}",
+                                            lane
+                                        )
+                                    }
+                                })
+                            })*
+                            _ => unreachable!(
+                                "Unsupported start_bit: {}",
+                                start_bit
+                            )
+                        }
+                    })
+                }
+
                 unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self {
                     let packed_len = 128 * width / size_of::<Self>();
                     debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len);
@@ -304,10 +335,10 @@ macro_rules! impl_packing {
     };
 }
 
-impl_packing!(u8);
+//impl_packing!(u8);
 impl_packing!(u16);
-impl_packing!(u32);
-impl_packing!(u64);
+//impl_packing!(u32);
+//impl_packing!(u64);
 
 #[cfg(test)]
 mod test {
diff --git a/src/lib.rs b/src/lib.rs
index 4bf858d..bf5f689 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,6 +44,19 @@ macro_rules! seq_t {
     ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
     ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
     ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
+    ($W:expr, $ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)};
+    ($W:expr, $ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
+    ($W:expr, $ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
+    ($W:expr, $ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
+}
+
+// Macro for repeating a code block T::LANES times.
+#[macro_export]
+macro_rules! seq_lanes {
+    ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..128 $body)};
+    ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
+    ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
+    ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
 }
 
 #[cfg(test)]

From 30fda6dc5af3315b3e5c786acb8a86fae46ea03e Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Thu, 4 Jul 2024 15:33:53 -0400
Subject: [PATCH 03/16] minimal improvement

---
 benches/bitpacking.rs |  18 +-------
 src/bitpacking.rs     | 101 +++++-------------------------------------
 src/lib.rs            |  13 ------
 3 files changed, 13 insertions(+), 119 deletions(-)

diff --git a/benches/bitpacking.rs b/benches/bitpacking.rs
index b1f425e..f29402d 100644
--- a/benches/bitpacking.rs
+++ b/benches/bitpacking.rs
@@ -46,7 +46,7 @@ fn pack(c: &mut Criterion) {
 
     {
         let mut group = c.benchmark_group("unpack-single");
-        group.bench_function("unpack_single 16 <- 3", |b| {
+        group.bench_function("unpack single 16 <- 3", |b| {
             const WIDTH: usize = 3;
             let values = vec![3u16; 1024];
             let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];
@@ -61,22 +61,6 @@ fn pack(c: &mut Criterion) {
                 }
             });
         });
-
-        group.bench_function("unpack_single2 16 <- 3", |b| {
-            const WIDTH: usize = 3;
-            let values = vec![3u16; 1024];
-            let mut packed = vec![0; 128 * WIDTH / size_of::<u16>()];
-            BitPacking::pack::<WIDTH>(array_ref![values, 0, 1024], array_mut_ref![packed, 0, 192]);
-
-            b.iter(|| {
-                for i in 0..1024 {
-                    black_box::<u16>(BitPacking::unpack_single2::<WIDTH>(
-                        array_ref![packed, 0, 192],
-                        i,
-                    ));
-                }
-            });
-        });
     }
 }
 
diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index f4ea35d..f97db6f 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -3,7 +3,7 @@ use core::mem::size_of;
 use paste::paste;
 use seq_macro::seq;
 
-use crate::{pack, seq_t, seq_start_bits, seq_lanes, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
+use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
 
 pub struct BitPackWidth<const W: usize>;
 pub trait SupportedBitPackWidth<T> {}
@@ -64,19 +64,14 @@ pub trait BitPacking: FastLanes {
         // decompression can be fused efficiently with encodings like delta and RLE.
         //
         // First step, we need to get the lane and row for interpretation #1 above.
-        let lane = index % Self::LANES;
-        let row = {
-            // This is the inverse of the `index` function from the pack/unpack macros:
-            //     fn index(row: usize, lane: usize) -> usize {
-            //         let o = row / 8;
-            //         let s = row % 8;
-            //         (FL_ORDER[o] * 16) + (s * 128) + lane
-            //     }
-            let s = index / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
-            let fl_order = (index - s * 128 - lane) / 16; // value of FL_ORDER[o]
-            let o = FL_ORDER[fl_order]; // because this transposition is invertible!
-            o * 8 + s
-        };
+        let (lane, row): (usize, usize) = seq!(I in 0..1024 {
+            match index {
+                #(I =>
+                    Self::packed_lane_and_row::<I>(),
+                )*
+                _ => unreachable!("Unsupported index: {}", index)
+            }
+        });
 
         // From the row, we can get the correct start bit within the lane.
         let start_bit = row * W;
@@ -100,11 +95,6 @@ pub trait BitPacking: FastLanes {
         (lo | hi) & mask
     }
 
-    /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
-    fn unpack_single2<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
-    where
-        BitPackWidth<W>: SupportedBitPackWidth<Self>;
-
     fn packed_lane_and_row<const INDEX: usize>() -> (usize, usize) {
         // We can think of the input array as effectively a row-major, left-to-right
         // 2-D array of with `Self::LANES` columns and `Self::T` rows.
@@ -240,73 +230,6 @@ macro_rules! impl_packing {
                     })
                 }
 
-                /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
-                fn unpack_single2<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
-                where
-                    BitPackWidth<W>: SupportedBitPackWidth<Self>,
-                {
-                    // Special case for W=0, since there's only one possible value.
-                    if W == 0 {
-                        return 0 as $T;
-                    }
-
-                    // We can think of the input array as effectively a row-major, left-to-right
-                    // 2-D array of with `Self::LANES` columns and `Self::T` rows.
-                    //
-                    // Meanwhile, we can think of the packed array as either:
-                    //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
-                    //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
-                    //
-                    // Bitpacking involves a transposition of the input array ordering, such that
-                    // decompression can be fused efficiently with encodings like delta and RLE.
-                    //
-                    // First step, we need to get the lane and row for interpretation #1 above.
-                    let (lane, row): (usize, usize) = seq!(I in 0..1024 {
-                        match index {
-                            #(I =>
-                                Self::packed_lane_and_row::<I>(),
-                            )*
-                            _ => unreachable!("Unsupported index: {}", index)
-                        }
-                    });
-
-                    // From the row, we can get the correct start bit within the lane.
-                    let start_bit = row * W;
-                    let start_word = (start_bit) / Self::T;
-                    let end_word = (start_bit + W - 1) / Self::T;
-                    let one_word = (start_word == end_word);
-
-                    #[inline]
-                    fn mask(width: usize) -> $T {
-                        if width == $T::T { <$T>::MAX } else { (1 << (width % $T::T)) - 1 }
-                    }
-                    
-                    seq_t!(W, ROW in $T, |W| {
-                        match start_bit {
-                            #(ROW * W => {
-                                seq_lanes!(LANE in $T {
-                                    match lane {
-                                        #(LANE => {
-                                            match one_word {
-                                                true => unsafe { $T::unpack_single_const_helper::<LANE, START_BIT, {true}>(packed, mask(W)) },
-                                                false => unsafe { $T::unpack_single_const_helper::<LANE, START_BIT, {false}>(packed, mask(W)) },
-                                            }
-                                        })*
-                                        _ => unreachable!(
-                                            "Unsupported lane: {}",
-                                            lane
-                                        )
-                                    }
-                                })
-                            })*
-                            _ => unreachable!(
-                                "Unsupported start_bit: {}",
-                                start_bit
-                            )
-                        }
-                    })
-                }
-
                 unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self {
                     let packed_len = 128 * width / size_of::<Self>();
                     debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len);
@@ -335,10 +258,10 @@ macro_rules! impl_packing {
     };
 }
 
-//impl_packing!(u8);
+impl_packing!(u8);
 impl_packing!(u16);
-//impl_packing!(u32);
-//impl_packing!(u64);
+impl_packing!(u32);
+impl_packing!(u64);
 
 #[cfg(test)]
 mod test {
diff --git a/src/lib.rs b/src/lib.rs
index bf5f689..4bf858d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,19 +44,6 @@ macro_rules! seq_t {
     ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
     ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
     ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
-    ($W:expr, $ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)};
-    ($W:expr, $ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
-    ($W:expr, $ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
-    ($W:expr, $ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
-}
-
-// Macro for repeating a code block T::LANES times.
-#[macro_export]
-macro_rules! seq_lanes {
-    ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..128 $body)};
-    ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
-    ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
-    ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
 }
 
 #[cfg(test)]

From e2471f495c877fee25e65610b3d0574f110937b9 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Mon, 15 Jul 2024 17:51:49 +0100
Subject: [PATCH 04/16] wip

---
 src/bitpacking.rs | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index f97db6f..1e4c04d 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -95,6 +95,7 @@ pub trait BitPacking: FastLanes {
         (lo | hi) & mask
     }
 
+    #[must_use]
     fn packed_lane_and_row<const INDEX: usize>() -> (usize, usize) {
         // We can think of the input array as effectively a row-major, left-to-right
         // 2-D array of with `Self::LANES` columns and `Self::T` rows.
@@ -123,27 +124,28 @@ pub trait BitPacking: FastLanes {
         (lane, row)
     }
 
-    /// Unpacks a single element at the provided LANE and START_BIT from a packed array of 1024 `W` bit elements,
-    /// where `W` is runtime-known instead of compile-time known.
+    /// Unpacks a single element at the provided `START_BIT` of the (runtime-specified) lane from a
+    /// packed array of 1024 `W` bit elements, where `W` is runtime-known (specified via the mask)
+    /// instead of compile-time known. The point of this function is to produce a reusable block of
+    /// code that balances compile-time optimization with code size.
     ///
     /// # Safety
     /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W`
     /// is the packed width. The output slice must be of exactly length 1024.
     /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
-    unsafe fn unpack_single_const_helper<const LANE: usize, const START_BIT: usize, const ONE_WORD: bool>(
-        packed: &[Self], mask: Self) -> Self
+    unsafe fn unpack_single_const_helper<const START_BIT: usize, const ONE_WORD: bool>(
+        packed: &[Self], lane: usize, mask: Self) -> Self
     where
-        Pred< { START_BIT < Self::T * Self::T }> : Satisfied,
-        Pred< { LANE < Self::LANES }> : Satisfied
+        Pred< { START_BIT < Self::T * Self::T }> : Satisfied
     {
         let start_word = START_BIT / Self::T;
         let lo_shift = START_BIT % Self::T;
-        let lo = packed[Self::LANES * start_word + LANE] >> lo_shift;
+        let lo = packed[Self::LANES * start_word + lane] >> lo_shift;
         if ONE_WORD {
             lo & mask
         } else {
             let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false
-            let hi = packed[Self::LANES * (start_word + 1) + LANE] << hi_shift;
+            let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift;
             (lo | hi) & mask
         }
     }

From b536d27a1b42a0f440a8e45427b4084c99aa9d10 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Tue, 16 Jul 2024 12:50:07 +0100
Subject: [PATCH 05/16] wip

---
 src/bitpacking.rs | 111 ++++++++++++++++++++++++----------------------
 src/lib.rs        |   1 +
 2 files changed, 60 insertions(+), 52 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 1e4c04d..d2a7fab 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -46,54 +46,7 @@ pub trait BitPacking: FastLanes {
     /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
     fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
     where
-        BitPackWidth<W>: SupportedBitPackWidth<Self>,
-    {
-        // Special case for W=0, since there's only one possible value.
-        if W == 0 {
-            return Self::zero();
-        }
-
-        // We can think of the input array as effectively a row-major, left-to-right
-        // 2-D array of with `Self::LANES` columns and `Self::T` rows.
-        //
-        // Meanwhile, we can think of the packed array as either:
-        //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
-        //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
-        //
-        // Bitpacking involves a transposition of the input array ordering, such that
-        // decompression can be fused efficiently with encodings like delta and RLE.
-        //
-        // First step, we need to get the lane and row for interpretation #1 above.
-        let (lane, row): (usize, usize) = seq!(I in 0..1024 {
-            match index {
-                #(I =>
-                    Self::packed_lane_and_row::<I>(),
-                )*
-                _ => unreachable!("Unsupported index: {}", index)
-            }
-        });
-
-        // From the row, we can get the correct start bit within the lane.
-        let start_bit = row * W;
-
-        // We need to read one or two T-bit words from the lane, depending on how our
-        // target W-bit value overlaps with the T-bit words. To avoid a branch, we
-        // always read two T-bit words, and then shift/mask as needed.
-        let lo_word = start_bit / Self::T;
-        let lo_shift = start_bit % Self::T;
-        let lo = packed[Self::LANES * lo_word + lane] >> lo_shift;
-
-        let hi_word = (start_bit + W - 1) / Self::T;
-        let hi_shift = (Self::T - lo_shift) % Self::T;
-        let hi = packed[Self::LANES * hi_word + lane] << hi_shift;
-
-        let mask: Self = if W == Self::T {
-            Self::max_value()
-        } else {
-            ((Self::one()) << (W % Self::T)) - Self::one()
-        };
-        (lo | hi) & mask
-    }
+        BitPackWidth<W>: SupportedBitPackWidth<Self>;
 
     #[must_use]
     fn packed_lane_and_row<const INDEX: usize>() -> (usize, usize) {
@@ -124,6 +77,15 @@ pub trait BitPacking: FastLanes {
         (lane, row)
     }
 
+    #[must_use]
+    fn mask<const W: usize>() -> Self {
+        return if W == Self::T {
+            Self::max_value()
+        } else {
+            ((Self::one()) << (W % Self::T)) - Self::one()
+        };
+    }
+
     /// Unpacks a single element at the provided `START_BIT` of the (runtime-specified) lane from a
     /// packed array of 1024 `W` bit elements, where `W` is runtime-known (specified via the mask)
     /// instead of compile-time known. The point of this function is to produce a reusable block of
@@ -136,7 +98,7 @@ pub trait BitPacking: FastLanes {
     unsafe fn unpack_single_const_helper<const START_BIT: usize, const ONE_WORD: bool>(
         packed: &[Self], lane: usize, mask: Self) -> Self
     where
-        Pred< { START_BIT < Self::T * Self::T }> : Satisfied
+        Pred< { START_BIT < Self::BITS_PER_LANE }> : Satisfied
     {
         let start_word = START_BIT / Self::T;
         let lo_shift = START_BIT % Self::T;
@@ -232,6 +194,51 @@ macro_rules! impl_packing {
                     })
                 }
 
+                /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
+                fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
+                where
+                    BitPackWidth<W>: SupportedBitPackWidth<Self>,
+                {
+                    // Special case for W=0, since there's only one possible value.
+                    if W == 0 {
+                        return 0 as $T;
+                    }
+
+                    // We can think of the input array as effectively a row-major, left-to-right
+                    // 2-D array of with `Self::LANES` columns and `Self::T` rows.
+                    //
+                    // Meanwhile, we can think of the packed array as either:
+                    //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
+                    //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
+                    //
+                    // Bitpacking involves a transposition of the input array ordering, such that
+                    // decompression can be fused efficiently with encodings like delta and RLE.
+                    //
+                    // First step, we need to get the lane and row for interpretation #1 above.
+                    let (lane, row): (usize, usize) = seq!(I in 0..1024 {
+                        match index {
+                            #(I =>
+                                Self::packed_lane_and_row::<I>(),
+                            )*
+                            _ => unreachable!("Unsupported index: {}", index)
+                        }
+                    });
+
+                    let mask = Self::mask::<W>();
+
+                    seq_t!(ROW in $T {
+                        match row {
+                            #(ROW => {
+                                const start_bit = ROW * W;
+                                const remaining_bits: usize = $T::T - (start_bit % $T::T);
+                                const one_word: bool = remaining_bits <= W;
+                                return Self::unpack_single_const_helper::<start_bit, one_word>(packed, lane, mask);
+                            },)*
+                            _ => unreachable!("Unsupported row: {}", row)
+                        }
+                    })
+                }
+
                 unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self {
                     let packed_len = 128 * width / size_of::<Self>();
                     debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len);
@@ -260,10 +267,10 @@ macro_rules! impl_packing {
     };
 }
 
-impl_packing!(u8);
-impl_packing!(u16);
+//impl_packing!(u8);
+//impl_packing!(u16);
 impl_packing!(u32);
-impl_packing!(u64);
+// impl_packing!(u64);
 
 #[cfg(test)]
 mod test {
diff --git a/src/lib.rs b/src/lib.rs
index 4bf858d..fb18aaf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -24,6 +24,7 @@ pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];
 pub trait FastLanes: Sized + Unsigned + PrimInt {
     const T: usize = size_of::<Self>() * 8;
     const LANES: usize = 1024 / Self::T;
+    const BITS_PER_LANE: usize = Self::T * Self::T;
 }
 
 impl FastLanes for u8 {}

From aecf42fcb25f899a1800ba9c64c8bb8b5ebbded4 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Wed, 17 Jul 2024 14:13:17 +0100
Subject: [PATCH 06/16] I hate macros

---
 src/bitpacking.rs | 107 ++--------------------------------------------
 src/lib.rs        |   1 -
 src/macros.rs     |  81 +++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+), 104 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index d2a7fab..a06c267 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -1,9 +1,8 @@
 use arrayref::{array_mut_ref, array_ref};
 use core::mem::size_of;
 use paste::paste;
-use seq_macro::seq;
 
-use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
+use crate::{pack, seq_t, unpack, unpack_single, FastLanes, Pred, Satisfied};
 
 pub struct BitPackWidth<const W: usize>;
 pub trait SupportedBitPackWidth<T> {}
@@ -48,70 +47,6 @@ pub trait BitPacking: FastLanes {
     where
         BitPackWidth<W>: SupportedBitPackWidth<Self>;
 
-    #[must_use]
-    fn packed_lane_and_row<const INDEX: usize>() -> (usize, usize) {
-        // We can think of the input array as effectively a row-major, left-to-right
-        // 2-D array of with `Self::LANES` columns and `Self::T` rows.
-        //
-        // Meanwhile, we can think of the packed array as either:
-        //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
-        //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
-        //
-        // Bitpacking involves a transposition of the input array ordering, such that
-        // decompression can be fused efficiently with encodings like delta and RLE.
-        //
-        // First step, we need to get the lane and row for interpretation #1 above.
-        let lane = INDEX % Self::LANES;
-        let row = {
-            // This is the inverse of the `index` function from the pack/unpack macros:
-            //     fn index(row: usize, lane: usize) -> usize {
-            //         let o = row / 8;
-            //         let s = row % 8;
-            //         (FL_ORDER[o] * 16) + (s * 128) + lane
-            //     }
-            let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
-            let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
-            let o = FL_ORDER[fl_order]; // because this transposition is invertible!
-            o * 8 + s
-        };
-        (lane, row)
-    }
-
-    #[must_use]
-    fn mask<const W: usize>() -> Self {
-        return if W == Self::T {
-            Self::max_value()
-        } else {
-            ((Self::one()) << (W % Self::T)) - Self::one()
-        };
-    }
-
-    /// Unpacks a single element at the provided `START_BIT` of the (runtime-specified) lane from a
-    /// packed array of 1024 `W` bit elements, where `W` is runtime-known (specified via the mask)
-    /// instead of compile-time known. The point of this function is to produce a reusable block of
-    /// code that balances compile-time optimization with code size.
-    ///
-    /// # Safety
-    /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W`
-    /// is the packed width. The output slice must be of exactly length 1024.
-    /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
-    unsafe fn unpack_single_const_helper<const START_BIT: usize, const ONE_WORD: bool>(
-        packed: &[Self], lane: usize, mask: Self) -> Self
-    where
-        Pred< { START_BIT < Self::BITS_PER_LANE }> : Satisfied
-    {
-        let start_word = START_BIT / Self::T;
-        let lo_shift = START_BIT % Self::T;
-        let lo = packed[Self::LANES * start_word + lane] >> lo_shift;
-        if ONE_WORD {
-            lo & mask
-        } else {
-            let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false
-            let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift;
-            (lo | hi) & mask
-        }
-    }
-
     /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements,
     /// where `W` is runtime-known instead of compile-time known.
     ///
@@ -195,47 +130,13 @@ macro_rules! impl_packing {
                 }
 
                 /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
+                #[inline(never)]
                 fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
                 where
                     BitPackWidth<W>: SupportedBitPackWidth<Self>,
                 {
-                    // Special case for W=0, since there's only one possible value.
-                    if W == 0 {
-                        return 0 as $T;
-                    }
-
-                    // We can think of the input array as effectively a row-major, left-to-right
-                    // 2-D array of with `Self::LANES` columns and `Self::T` rows.
-                    //
-                    // Meanwhile, we can think of the packed array as either:
-                    //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
-                    //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
-                    //
-                    // Bitpacking involves a transposition of the input array ordering, such that
-                    // decompression can be fused efficiently with encodings like delta and RLE.
-                    //
-                    // First step, we need to get the lane and row for interpretation #1 above.
-                    let (lane, row): (usize, usize) = seq!(I in 0..1024 {
-                        match index {
-                            #(I =>
-                                Self::packed_lane_and_row::<I>(),
-                            )*
-                            _ => unreachable!("Unsupported index: {}", index)
-                        }
-                    });
-
-                    let mask = Self::mask::<W>();
-
-                    seq_t!(ROW in $T {
-                        match row {
-                            #(ROW => {
-                                const start_bit = ROW * W;
-                                const remaining_bits: usize = $T::T - (start_bit % $T::T);
-                                const one_word: bool = remaining_bits <= W;
-                                return Self::unpack_single_const_helper::<start_bit, one_word>(packed, lane, mask);
-                            },)*
-                            _ => unreachable!("Unsupported row: {}", row)
-                        }
+                    unpack_single!($T, W, packed, index, |$elem| {
+                        $elem
                     })
                 }
 
diff --git a/src/lib.rs b/src/lib.rs
index fb18aaf..4bf858d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -24,7 +24,6 @@ pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];
 pub trait FastLanes: Sized + Unsigned + PrimInt {
     const T: usize = size_of::<Self>() * 8;
     const LANES: usize = 1024 / Self::T;
-    const BITS_PER_LANE: usize = Self::T * Self::T;
 }
 
 impl FastLanes for u8 {}
diff --git a/src/macros.rs b/src/macros.rs
index e725e42..0be7072 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -173,6 +173,87 @@ macro_rules! unpack {
     };
 }
 
+#[macro_export]
+macro_rules! unpack_single {
+    ($T:ty, $W:expr, $packed:expr, $index:expr, | $_1:tt $elem:ident | $($body:tt)*) => {
+        macro_rules! __kernel__ {( $_1 $elem:ident ) => ( $($body)* )}
+        {
+            use $crate::{seq_t, FL_ORDER, FastLanes};
+            use paste::paste;
+
+            // The number of bits of T.
+            const T: usize = <$T>::T;
+
+            // This calculation of (lane, row) is the inverse of the `index` function from the
+            // pack/unpack macros
+            #[inline(always)]
+            const fn lane_and_row<const INDEX: usize>() -> (usize, usize) {
+                const lane: usize = INDEX % <$T>::LANES;
+                const row: usize = {
+                    let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+                    let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
+                    let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+                    o * 8 + s
+                };
+                (lane, row)
+            }
+
+            fn unpack_single_const_helper<const START_BIT: usize, const ONE_WORD: bool>(
+                packed: &[$T], lane: usize, mask: Self) -> Self
+            where
+                Pred< { START_BIT < T * T }> : Satisfied
+            {
+                let start_word = START_BIT / Self::T;
+                let lo_shift = START_BIT % Self::T;
+                let lo = packed[Self::LANES * start_word + lane] >> lo_shift;
+                if ONE_WORD {
+                    lo & mask
+                } else {
+                    let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false
+                    let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift;
+                    (lo | hi) & mask
+                }
+            }
+
+            if $W == 0 {
+                // Special case for W=0, we just need to zero the output.
+                // We'll still respect the iteration order in case the kernel has side effects.
+                let zero: $T = 0;
+                __kernel__!(zero);
+            } else {
+                let (lane, row): (usize, usize) = seq!(I in 0..1024 {
+                        match index {
+                            #(I =>
+                                lane_and_row::<I>(),
+                            )*
+                            _ => unreachable!("Unsupported index: {}", index)
+                        }
+                    });
+
+                // Special case for W=T, we can just copy the packed value directly to the output.
+                if $W == T {
+                    let val = $packed[<$T>::LANES * row + lane];
+                    __kernel__!(val);
+                } else {
+                    const mask: usize = (1 << ($W % T)) - 1;
+                    paste!(seq_t!(ROW in $T {
+                        match row {
+                            #(ROW => {
+                                const START_BIT: usize = ROW * $W;
+                                const REMAINING_BITS: usize = T - (START_BIT % T);
+                                const ONE_WORD: bool = REMAINING_BITS <= $W;
+                                let val = unpack_single_const_helper::<START_BIT, ONE_WORD>($packed, lane, mask);
+                                __kernel__!(val);
+                            },)*
+                            _ => unreachable!("Unsupported row: {}", row)
+                        }
+                    }))
+                }
+            }
+        }
+    };
+}
+
 #[cfg(test)]
 mod test {
     use crate::{BitPacking, FastLanes};

From 3950913e59588acdaefd0f3559952ebe0fcdb578 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Wed, 17 Jul 2024 14:31:22 +0100
Subject: [PATCH 07/16] fuck kernels

---
 src/bitpacking.rs |   4 +-
 src/macros.rs     | 136 ++++++++++++++++++++++------------------------
 2 files changed, 67 insertions(+), 73 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index a06c267..cf8419a 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -135,9 +135,7 @@ macro_rules! impl_packing {
                 where
                     BitPackWidth<W>: SupportedBitPackWidth<Self>,
                 {
-                    unpack_single!($T, W, packed, index, |$elem| {
-                        $elem
-                    })
+                    unpack_single!($T, W, packed, index);
                 }
 
                 unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self {
diff --git a/src/macros.rs b/src/macros.rs
index 0be7072..5b34cf0 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -175,83 +175,79 @@ macro_rules! unpack {
 
 #[macro_export]
 macro_rules! unpack_single {
-    ($T:ty, $W:expr, $packed:expr, $index:expr, | $_1:tt $elem:ident | $($body:tt)*) => {
-        macro_rules! __kernel__ {( $_1 $elem:ident ) => ( $($body)* )}
-        {
-            use $crate::{seq_t, FL_ORDER, FastLanes};
-            use paste::paste;
-
-            // The number of bits of T.
-            const T: usize = <$T>::T;
+    ($T:ty, $W:expr, $packed:expr, $index:expr) => {
+        use $crate::{FastLanes, FL_ORDER, seq_t};
+        use seq_macro::seq;
+        use paste::paste;
+
+        // The number of bits of T.
+        const T: usize = <$T>::T;
+
+        // This calculation of (lane, row) is the inverse of the `index` function from the
+        // pack/unpack macros
+        #[inline(always)]
+        fn lane_and_row<const INDEX: usize>() -> (usize, usize)
+        where Pred< { INDEX < 1024 }> : Satisfied {
+            const lane: usize = INDEX % <$T>::LANES;
+            const row: usize = {
+                let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+                let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
+                let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+                o * 8 + s
+            };
+            (lane, row)
+        }
 
-            // This calculation of (lane, row) is the inverse of the `index` function from the
-            // pack/unpack macros
-            #[inline(always)]
-            const fn lane_and_row<const INDEX: usize>() -> (usize, usize) {
-                const lane: usize = INDEX % <$T>::LANES;
-                const row: usize = {
-                    let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
-                    let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
-                    let o = FL_ORDER[fl_order]; // because this transposition is invertible!
-                    o * 8 + s
-                };
-                (lane, row)
+        fn unpack_single_const<const START_BIT: usize, const ONE_WORD: bool>(
+            packed: &[$T], lane: usize, mask: $T) -> $T
+        where
+            Pred< { START_BIT < T * T }> : Satisfied
+        {
+            let start_word = START_BIT / T;
+            let lo_shift = START_BIT % T;
+            let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift;
+            if ONE_WORD {
+                lo & mask
+            } else {
+                let hi_shift = T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false
+                let hi = packed[<$T>::LANES * (start_word + 1) + lane] << hi_shift;
+                (lo | hi) & mask
             }
+        }
 
-            fn unpack_single_const_helper<const START_BIT: usize, const ONE_WORD: bool>(
-                packed: &[$T], lane: usize, mask: Self) -> Self
-            where
-                Pred< { START_BIT < T * T }> : Satisfied
-            {
-                let start_word = START_BIT / Self::T;
-                let lo_shift = START_BIT % Self::T;
-                let lo = packed[Self::LANES * start_word + lane] >> lo_shift;
-                if ONE_WORD {
-                    lo & mask
-                } else {
-                    let hi_shift = Self::T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false
-                    let hi = packed[Self::LANES * (start_word + 1) + lane] << hi_shift;
-                    (lo | hi) & mask
+        if $W == 0 {
+            // Special case for W=0, we just need to zero the output.
+            // We'll still respect the iteration order in case the kernel has side effects.
+            return 0 as $T;
+        }
+
+        let (lane, row): (usize, usize) = seq!(I in 0..1024 {
+                match $index {
+                    #(I =>
+                        lane_and_row::<I>(),
+                    )*
+                    _ => unreachable!("Unsupported index: {}", $index)
                 }
-            }
+            });
 
-            if $W == 0 {
-                // Special case for W=0, we just need to zero the output.
-                // We'll still respect the iteration order in case the kernel has side effects.
-                let zero: $T = 0;
-                __kernel__!(zero);
-            } else {
-                let (lane, row): (usize, usize) = seq!(I in 0..1024 {
-                        match index {
-                            #(I =>
-                                lane_and_row::<I>(),
-                            )*
-                            _ => unreachable!("Unsupported index: {}", index)
-                        }
-                    });
+        // Special case for W=T, we can just copy the packed value directly to the output.
+        if $W == T {
+            return $packed[<$T>::LANES * row + lane];
+        }
 
-                // Special case for W=T, we can just copy the packed value directly to the output.
-                if $W == T {
-                    let val = $packed[<$T>::LANES * row + lane];
-                    __kernel__!(val);
-                } else {
-                    const mask: usize = (1 << ($W % T)) - 1;
-                    paste!(seq_t!(ROW in $T {
-                        match row {
-                            #(ROW => {
-                                const START_BIT: usize = ROW * $W;
-                                const REMAINING_BITS: usize = T - (START_BIT % T);
-                                const ONE_WORD: bool = REMAINING_BITS <= $W;
-                                let val = unpack_single_const_helper::<START_BIT, ONE_WORD>($packed, lane, mask);
-                                __kernel__!(val);
-                            },)*
-                            _ => unreachable!("Unsupported row: {}", row)
-                        }
-                    }))
-                }
+        const mask: $T = (1 << ($W % T)) - 1;
+        paste!(seq_t!(ROW in $T {
+            match row {
+                #(ROW => {
+                    const START_BIT: usize = ROW * $W;
+                    const REMAINING_BITS: usize = T - (START_BIT % T);
+                    const ONE_WORD: bool = REMAINING_BITS <= $W;
+                    return unpack_single_const::<{START_BIT}, {ONE_WORD}>($packed, lane, mask);
+                },)*
+                _ => unreachable!("Unsupported row: {}", row)
             }
-        }
-    };
+        }))
+    }
 }
 
 #[cfg(test)]

From 9ea034e24d8d3e0c29a6332be6acc662f8dae9ad Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Wed, 17 Jul 2024 14:53:58 +0100
Subject: [PATCH 08/16] this is dumb and I hate it

---
 src/macros.rs | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/macros.rs b/src/macros.rs
index 5b34cf0..b3d8376 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -183,22 +183,7 @@ macro_rules! unpack_single {
         // The number of bits of T.
         const T: usize = <$T>::T;
 
-        // This calculation of (lane, row) is the inverse of the `index` function from the
-        // pack/unpack macros
-        #[inline(always)]
-        fn lane_and_row<const INDEX: usize>() -> (usize, usize)
-        where Pred< { INDEX < 1024 }> : Satisfied {
-            const lane: usize = INDEX % <$T>::LANES;
-            const row: usize = {
-                let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
-                let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
-                let o = FL_ORDER[fl_order]; // because this transposition is invertible!
-                o * 8 + s
-            };
-            (lane, row)
-        }
-
-        fn unpack_single_const<const START_BIT: usize, const ONE_WORD: bool>(
+        fn unpack_single_helper<const START_BIT: usize, const ONE_WORD: bool>(
             packed: &[$T], lane: usize, mask: $T) -> $T
         where
             Pred< { START_BIT < T * T }> : Satisfied
@@ -221,11 +206,20 @@ macro_rules! unpack_single {
             return 0 as $T;
         }
 
-        let (lane, row): (usize, usize) = seq!(I in 0..1024 {
+        let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 {
                 match $index {
-                    #(I =>
-                        lane_and_row::<I>(),
-                    )*
+                    #(INDEX => {
+                        // This calculation of (lane, row) is the inverse of the `index` function from the
+                        // pack/unpack macros
+                        const lane: usize = INDEX % <$T>::LANES;
+                        const row: usize = {
+                            let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+                            let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
+                            let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+                            o * 8 + s
+                        };
+                        (lane, row)
+                    })*
                     _ => unreachable!("Unsupported index: {}", $index)
                 }
             });
@@ -242,7 +236,7 @@ macro_rules! unpack_single {
                     const START_BIT: usize = ROW * $W;
                     const REMAINING_BITS: usize = T - (START_BIT % T);
                     const ONE_WORD: bool = REMAINING_BITS <= $W;
-                    return unpack_single_const::<{START_BIT}, {ONE_WORD}>($packed, lane, mask);
+                    return unpack_single_helper::<{START_BIT}, {ONE_WORD}>($packed, lane, mask);
                 },)*
                 _ => unreachable!("Unsupported row: {}", row)
             }

From 8a1ce5b16f130537170e34049fdff55814131571 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Thu, 18 Jul 2024 13:48:35 +0100
Subject: [PATCH 09/16] everything const, compiles and runs

---
 src/bitpacking.rs | 82 ++++++++++++++++++++++++++++++++++++-----------
 src/macros.rs     | 74 ++++++++++++++----------------------------
 2 files changed, 88 insertions(+), 68 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index cf8419a..0d050f4 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -1,8 +1,9 @@
 use arrayref::{array_mut_ref, array_ref};
 use core::mem::size_of;
 use paste::paste;
+use seq_macro::seq;
 
-use crate::{pack, seq_t, unpack, unpack_single, FastLanes, Pred, Satisfied};
+use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
 
 pub struct BitPackWidth<const W: usize>;
 pub trait SupportedBitPackWidth<T> {}
@@ -130,33 +131,78 @@ macro_rules! impl_packing {
                 }
 
                 /// Unpacks a single element at the provided index from a packed array of 1024 `W` bit elements.
-                #[inline(never)]
                 fn unpack_single<const W: usize>(packed: &[Self; 1024 * W / Self::T], index: usize) -> Self
                 where
                     BitPackWidth<W>: SupportedBitPackWidth<Self>,
                 {
-                    unpack_single!($T, W, packed, index);
+                    unsafe {
+                        Self::unchecked_unpack_single(W, packed, index)
+                    }
                 }
 
-                unsafe fn unchecked_unpack_single(width: usize, input: &[Self], index: usize) -> Self {
+                #[allow(arithmetic_overflow, unused_comparisons)]
+                unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self {
                     let packed_len = 128 * width / size_of::<Self>();
-                    debug_assert_eq!(input.len(), packed_len, "Input buffer must be of size {}", packed_len);
+                    debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len);
                     debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);
-                    debug_assert!(index <= 1024, "index must be less than or equal to 1024");
+                    debug_assert!(index < 1024, "index must be less than or equal to 1024");
+
+                    let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 {
+                        match index {
+                            #(INDEX => {
+                                // This calculation of (lane, row) is the inverse of the `index` function from the
+                                // pack/unpack macros
+                                const LANE: usize = INDEX % <$T>::LANES;
+                                const ROW: usize = {
+                                    let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+                                    let fl_order = (INDEX - s * 128 - LANE) / 16; // value of FL_ORDER[o]
+                                    let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+                                    o * 8 + s
+                                };
+                                (LANE, ROW)
+                            })*
+                            _ => unreachable!("Unsupported index: {}", index)
+                        }
+                    });
 
                     seq_t!(W in $T {
                         match width {
                             #(W => {
-                                Self::unpack_single::<W>(
-                                    array_ref![input, 0, 1024 * W / <$T>::T],
-                                    index
-                                )
-                            })*
+                                if W == 0 {
+                                    // Special case for W=0, we just need to zero the output.
+                                    return 0 as $T;
+                                }
+                                seq_t!(ROW in $T {
+                                    match row {
+                                        #(ROW => {
+                                            const MASK: $T = (1 << (W % <$T>::T)) - 1;
+                                            const START_BIT: usize = ROW * W;
+
+                                            const START_WORD: usize = START_BIT / <$T>::T;
+                                             // bits to shift out of lo word
+                                            const LO_SHIFT: usize = START_BIT % <$T>::T;
+                                            // remaining bits in the lo word == bits to shift from hi word
+                                            const REMAINING_BITS: usize = <$T>::T - LO_SHIFT;
+
+                                            let lo = packed[<$T>::LANES * START_WORD + lane] >> LO_SHIFT;
+                                            return if REMAINING_BITS >= W {
+                                                // in this case we will mask out all bits of hi word
+                                                lo & MASK
+                                            } else {
+                                                // guaranteed that lo_shift > 0 and thus remaining_bits < T
+                                                let hi = packed[<$T>::LANES * (START_WORD + 1) + lane] << REMAINING_BITS;
+                                                (lo | hi) & MASK
+                                            }
+                                        },)*
+                                        _ => unreachable!("Unsupported row: {}", row)
+                                    }
+                                })
+                            },)*
                             // seq_t has exclusive upper bound
-                            Self::T => Self::unpack_single::<{ Self::T }>(
-                                array_ref![input, 0, 1024],
-                                index
-                            ),
+                            Self::T => {
+                                // Special case for W=T, we can just read the value directly
+                                return packed[<$T>::LANES * row + lane];
+                            },
                             _ => unreachable!("Unsupported width: {}", width)
                         }
                     })
@@ -166,10 +212,10 @@ macro_rules! impl_packing {
     };
 }
 
-//impl_packing!(u8);
-//impl_packing!(u16);
+impl_packing!(u8);
+impl_packing!(u16);
 impl_packing!(u32);
-// impl_packing!(u64);
+impl_packing!(u64);
 
 #[cfg(test)]
 mod test {
diff --git a/src/macros.rs b/src/macros.rs
index b3d8376..5bf02c4 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -175,73 +175,47 @@ macro_rules! unpack {
 
 #[macro_export]
 macro_rules! unpack_single {
-    ($T:ty, $W:expr, $packed:expr, $index:expr) => {
-        use $crate::{FastLanes, FL_ORDER, seq_t};
-        use seq_macro::seq;
+    // $W must be constant / compile-time known
+    ($T:ty, $W:expr, $packed:expr, $row:expr, $lane:expr) => {{
+        use $crate::{FastLanes, seq_t};
         use paste::paste;
 
         // The number of bits of T.
         const T: usize = <$T>::T;
 
-        fn unpack_single_helper<const START_BIT: usize, const ONE_WORD: bool>(
-            packed: &[$T], lane: usize, mask: $T) -> $T
-        where
-            Pred< { START_BIT < T * T }> : Satisfied
-        {
-            let start_word = START_BIT / T;
-            let lo_shift = START_BIT % T;
-            let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift;
-            if ONE_WORD {
-                lo & mask
-            } else {
-                let hi_shift = T - lo_shift; // guaranteed that lo_shift > 0 if ONE_WORD == false
-                let hi = packed[<$T>::LANES * (start_word + 1) + lane] << hi_shift;
-                (lo | hi) & mask
-            }
-        }
-
         if $W == 0 {
             // Special case for W=0, we just need to zero the output.
-            // We'll still respect the iteration order in case the kernel has side effects.
             return 0 as $T;
+        } else if $W == T {
+            return $packed[<$T>::LANES * $row + $lane];
         }
 
-        let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 {
-                match $index {
-                    #(INDEX => {
-                        // This calculation of (lane, row) is the inverse of the `index` function from the
-                        // pack/unpack macros
-                        const lane: usize = INDEX % <$T>::LANES;
-                        const row: usize = {
-                            let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
-                            let fl_order = (INDEX - s * 128 - lane) / 16; // value of FL_ORDER[o]
-                            let o = FL_ORDER[fl_order]; // because this transposition is invertible!
-                            o * 8 + s
-                        };
-                        (lane, row)
-                    })*
-                    _ => unreachable!("Unsupported index: {}", $index)
-                }
-            });
-
-        // Special case for W=T, we can just copy the packed value directly to the output.
-        if $W == T {
-            return $packed[<$T>::LANES * row + lane];
-        }
-
-        const mask: $T = (1 << ($W % T)) - 1;
         paste!(seq_t!(ROW in $T {
-            match row {
+            match $row {
                 #(ROW => {
+                    const MASK: $T = (1 << ($W % T)) - 1;
                     const START_BIT: usize = ROW * $W;
-                    const REMAINING_BITS: usize = T - (START_BIT % T);
-                    const ONE_WORD: bool = REMAINING_BITS <= $W;
-                    return unpack_single_helper::<{START_BIT}, {ONE_WORD}>($packed, lane, mask);
+
+                    const START_WORD: usize = START_BIT / T;
+                     // bits to shift out of lo word
+                    const LO_SHIFT: usize = START_BIT % T;
+                    // remaining bits in the lo word == bits to shift from hi word
+                    const REMAINING_BITS: usize = T - LO_SHIFT;
+
+                    let lo = packed[<$T>::LANES * START_WORD + $lane] >> LO_SHIFT;
+                    return if REMAINING_BITS >= W {
+                        // in this case we will mask out all bits of hi word
+                        lo & MASK
+                    } else {
+                        // guaranteed that lo_shift > 0 and thus remaining_bits < T
+                        let hi = packed[<$T>::LANES * (START_WORD + 1) + $lane] << REMAINING_BITS;
+                        (lo | hi) & MASK
+                    }
                 },)*
                 _ => unreachable!("Unsupported row: {}", row)
             }
         }))
-    }
+    }}
 }
 
 #[cfg(test)]

From 7432bcff76dd9849f05f4d79eeae781a42120839 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Thu, 18 Jul 2024 14:22:07 +0100
Subject: [PATCH 10/16] faster smaller code

---
 src/bitpacking.rs | 41 ++++++++++++++++-------------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 0d050f4..f7b60e2 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -172,31 +172,22 @@ macro_rules! impl_packing {
                                     // Special case for W=0, we just need to zero the output.
                                     return 0 as $T;
                                 }
-                                seq_t!(ROW in $T {
-                                    match row {
-                                        #(ROW => {
-                                            const MASK: $T = (1 << (W % <$T>::T)) - 1;
-                                            const START_BIT: usize = ROW * W;
-
-                                            const START_WORD: usize = START_BIT / <$T>::T;
-                                             // bits to shift out of lo word
-                                            const LO_SHIFT: usize = START_BIT % <$T>::T;
-                                            // remaining bits in the lo word == bits to shift from hi word
-                                            const REMAINING_BITS: usize = <$T>::T - LO_SHIFT;
-
-                                            let lo = packed[<$T>::LANES * START_WORD + lane] >> LO_SHIFT;
-                                            return if REMAINING_BITS >= W {
-                                                // in this case we will mask out all bits of hi word
-                                                lo & MASK
-                                            } else {
-                                                // guaranteed that lo_shift > 0 and thus remaining_bits < T
-                                                let hi = packed[<$T>::LANES * (START_WORD + 1) + lane] << REMAINING_BITS;
-                                                (lo | hi) & MASK
-                                            }
-                                        },)*
-                                        _ => unreachable!("Unsupported row: {}", row)
-                                    }
-                                })
+
+                                const MASK: $T = (1 << (W % <$T>::T)) - 1;
+                                let start_bit = row * W;
+                                let start_word = start_bit / <$T>::T;
+                                let lo_shift = start_bit % <$T>::T;
+                                let remaining_bits = <$T>::T - lo_shift;
+
+                                let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift;
+                                return if remaining_bits >= W {
+                                    // in this case we will mask out all bits of hi word
+                                    lo & MASK
+                                } else {
+                                    // guaranteed that lo_shift > 0 and thus remaining_bits < T
+                                    let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits;
+                                    (lo | hi) & MASK
+                                }
                             },)*
                             // seq_t has exclusive upper bound
                             Self::T => {

From 6cd5629257af6da14265f0608b84c7f71d227991 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Thu, 18 Jul 2024 14:40:25 +0100
Subject: [PATCH 11/16] faster, less const magic

---
 src/bitpacking.rs | 69 ++++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 33 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index f7b60e2..935f469 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -135,17 +135,10 @@ macro_rules! impl_packing {
                 where
                     BitPackWidth<W>: SupportedBitPackWidth<Self>,
                 {
-                    unsafe {
-                        Self::unchecked_unpack_single(W, packed, index)
+                    if W == 0 {
+                        // Special case for W=0, we just need to zero the output.
+                        return 0 as $T;
                     }
-                }
-
-                #[allow(arithmetic_overflow, unused_comparisons)]
-                unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self {
-                    let packed_len = 128 * width / size_of::<Self>();
-                    debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len);
-                    debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);
-                    debug_assert!(index < 1024, "index must be less than or equal to 1024");
 
                     let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 {
                         match index {
@@ -165,34 +158,44 @@ macro_rules! impl_packing {
                         }
                     });
 
+                    if W == Self::T {
+                        // Special case for W==T, we can just read the value directly
+                        return packed[<$T>::LANES * row + lane];
+                    }
+
+                    let mask: $T = (1 << (W % <$T>::T)) - 1;
+                    let start_bit = row * W;
+                    let start_word = start_bit / <$T>::T;
+                    let lo_shift = start_bit % <$T>::T;
+                    let remaining_bits = <$T>::T - lo_shift;
+
+                    let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift;
+                    return if remaining_bits >= W {
+                        // in this case we will mask out all bits of hi word
+                        lo & mask
+                    } else {
+                        // guaranteed that lo_shift > 0 and thus remaining_bits < T
+                        let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits;
+                        (lo | hi) & mask
+                    }
+                }
+
+                unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self {
+                    let packed_len = 128 * width / size_of::<Self>();
+                    debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len);
+                    debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);
+                    debug_assert!(index < 1024, "index must be less than or equal to 1024");
+
+                    const T: usize = <$T>::T;
+
                     seq_t!(W in $T {
                         match width {
                             #(W => {
-                                if W == 0 {
-                                    // Special case for W=0, we just need to zero the output.
-                                    return 0 as $T;
-                                }
-
-                                const MASK: $T = (1 << (W % <$T>::T)) - 1;
-                                let start_bit = row * W;
-                                let start_word = start_bit / <$T>::T;
-                                let lo_shift = start_bit % <$T>::T;
-                                let remaining_bits = <$T>::T - lo_shift;
-
-                                let lo = packed[<$T>::LANES * start_word + lane] >> lo_shift;
-                                return if remaining_bits >= W {
-                                    // in this case we will mask out all bits of hi word
-                                    lo & MASK
-                                } else {
-                                    // guaranteed that lo_shift > 0 and thus remaining_bits < T
-                                    let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits;
-                                    (lo | hi) & MASK
-                                }
+                                return <$T>::unpack_single::<W>(array_ref![packed, 0, 1024 * W / T], index);
                             },)*
                             // seq_t has exclusive upper bound
-                            Self::T => {
-                                // Special case for W=T, we can just read the value directly
-                                return packed[<$T>::LANES * row + lane];
+                            T => {
+                                return <$T>::unpack_single::<T>(array_ref![packed, 0, 1024], index);
                             },
                             _ => unreachable!("Unsupported width: {}", width)
                         }

From d46e9f0925d4bd1d68b5f8770cdd73a1422fd711 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Fri, 19 Jul 2024 13:31:29 +0100
Subject: [PATCH 12/16] sub nanosecond, woot

---
 Cargo.toml        |  1 +
 src/bitpacking.rs | 53 +++++++++++++++++++++++++++--------------------
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5e8472b..01a16b2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,7 @@ edition = "2021"
 
 [dependencies]
 arrayref = "0.3.7"
+const_for = "0.1.4"
 num-traits = "0.2.19"
 paste = "1.0.15"
 seq-macro = "0.3.5"
diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 935f469..73972d3 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -1,7 +1,7 @@
 use arrayref::{array_mut_ref, array_ref};
 use core::mem::size_of;
 use paste::paste;
-use seq_macro::seq;
+use const_for::const_for;
 
 use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
 
@@ -140,25 +140,14 @@ macro_rules! impl_packing {
                         return 0 as $T;
                     }
 
-                    let (lane, row): (usize, usize) = seq!(INDEX in 0..1024 {
-                        match index {
-                            #(INDEX => {
-                                // This calculation of (lane, row) is the inverse of the `index` function from the
-                                // pack/unpack macros
-                                const LANE: usize = INDEX % <$T>::LANES;
-                                const ROW: usize = {
-                                    let s = INDEX / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
-                                    let fl_order = (INDEX - s * 128 - LANE) / 16; // value of FL_ORDER[o]
-                                    let o = FL_ORDER[fl_order]; // because this transposition is invertible!
-                                    o * 8 + s
-                                };
-                                (LANE, ROW)
-                            })*
-                            _ => unreachable!("Unsupported index: {}", index)
-                        }
-                    });
+                    assert!(index < 1024, "Index must be less than 1024, got {}", index);
+                    let (lane, row): (usize, usize) = {
+                        const LANES: [u8; 1024] = lanes_by_index::<$T>();
+                        const ROWS: [u8; 1024] = rows_by_index::<$T>();
+                        (LANES[index] as usize, ROWS[index] as usize)
+                    };
 
-                    if W == Self::T {
+                    if W == <$T>::T {
                         // Special case for W==T, we can just read the value directly
                         return packed[<$T>::LANES * row + lane];
                     }
@@ -177,17 +166,17 @@ macro_rules! impl_packing {
                         // guaranteed that lo_shift > 0 and thus remaining_bits < T
                         let hi = packed[<$T>::LANES * (start_word + 1) + lane] << remaining_bits;
                         (lo | hi) & mask
-                    }
+                    };
                 }
 
                 unsafe fn unchecked_unpack_single(width: usize, packed: &[Self], index: usize) -> Self {
+                    const T: usize = <$T>::T;
+
                     let packed_len = 128 * width / size_of::<Self>();
                     debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len);
                     debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);
                     debug_assert!(index < 1024, "index must be less than or equal to 1024");
 
-                    const T: usize = <$T>::T;
-
                     seq_t!(W in $T {
                         match width {
                             #(W => {
@@ -206,6 +195,26 @@ macro_rules! impl_packing {
     };
 }
 
+const fn lanes_by_index<T: FastLanes>() -> [u8; 1024] {
+    let mut lanes = [0u8; 1024];
+    const_for!(i in 0..1024 => {
+        lanes[i] = (i % T::LANES) as u8;
+    });
+    lanes
+}
+
+const fn rows_by_index<T: FastLanes>() -> [u8; 1024] {
+    let mut rows = [0u8; 1024];
+    const_for!(i in 0..1024 => {
+        let lane = i % T::LANES;
+        let s = i / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
+        let fl_order = (i - s * 128 - lane) / 16; // value of FL_ORDER[o]
+        let o = FL_ORDER[fl_order]; // because this transposition is invertible!
+        rows[i] = (o * 8 + s) as u8;
+    });
+    rows
+}
+
 impl_packing!(u8);
 impl_packing!(u16);
 impl_packing!(u32);

From d63a0e7ac8f848b6b374740d728847602c5254bc Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Fri, 19 Jul 2024 14:05:02 +0100
Subject: [PATCH 13/16] remove vestigial

---
 src/macros.rs | 45 ---------------------------------------------
 1 file changed, 45 deletions(-)

diff --git a/src/macros.rs b/src/macros.rs
index 5bf02c4..e725e42 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -173,51 +173,6 @@ macro_rules! unpack {
     };
 }
 
-#[macro_export]
-macro_rules! unpack_single {
-    // $W must be constant / compile-time known
-    ($T:ty, $W:expr, $packed:expr, $row:expr, $lane:expr) => {{
-        use $crate::{FastLanes, seq_t};
-        use paste::paste;
-
-        // The number of bits of T.
-        const T: usize = <$T>::T;
-
-        if $W == 0 {
-            // Special case for W=0, we just need to zero the output.
-            return 0 as $T;
-        } else if $W == T {
-            return $packed[<$T>::LANES * $row + $lane];
-        }
-
-        paste!(seq_t!(ROW in $T {
-            match $row {
-                #(ROW => {
-                    const MASK: $T = (1 << ($W % T)) - 1;
-                    const START_BIT: usize = ROW * $W;
-
-                    const START_WORD: usize = START_BIT / T;
-                     // bits to shift out of lo word
-                    const LO_SHIFT: usize = START_BIT % T;
-                    // remaining bits in the lo word == bits to shift from hi word
-                    const REMAINING_BITS: usize = T - LO_SHIFT;
-
-                    let lo = packed[<$T>::LANES * START_WORD + $lane] >> LO_SHIFT;
-                    return if REMAINING_BITS >= W {
-                        // in this case we will mask out all bits of hi word
-                        lo & MASK
-                    } else {
-                        // guaranteed that lo_shift > 0 and thus remaining_bits < T
-                        let hi = packed[<$T>::LANES * (START_WORD + 1) + $lane] << REMAINING_BITS;
-                        (lo | hi) & MASK
-                    }
-                },)*
-                _ => unreachable!("Unsupported row: {}", row)
-            }
-        }))
-    }}
-}
-
 #[cfg(test)]
 mod test {
     use crate::{BitPacking, FastLanes};

From b3d54962b0cc81edb20178a1079e62a69d003b24 Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Fri, 19 Jul 2024 14:07:33 +0100
Subject: [PATCH 14/16] comment

---
 src/bitpacking.rs | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 73972d3..f382d3b 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -140,6 +140,17 @@ macro_rules! impl_packing {
                         return 0 as $T;
                     }
 
+                    // We can think of the input array as effectively a row-major, left-to-right
+                    // 2-D array with `Self::LANES` columns and `Self::T` rows.
+                    //
+                    // Meanwhile, we can think of the packed array as either:
+                    //      1. `Self::T` rows of W-bit elements, with `Self::LANES` columns
+                    //      2. `W` rows of `Self::T`-bit words, with `Self::LANES` columns
+                    //
+                    // Bitpacking involves a transposition of the input array ordering, such that
+                    // decompression can be fused efficiently with encodings like delta and RLE.
+                    //
+                    // First step, we need to get the lane and row for interpretation #1 above.
                     assert!(index < 1024, "Index must be less than 1024, got {}", index);
                     let (lane, row): (usize, usize) = {
                         const LANES: [u8; 1024] = lanes_by_index::<$T>();
@@ -195,6 +206,7 @@ macro_rules! impl_packing {
     };
 }
 
+// helper function executed at compile-time to speed up unpack_single at runtime
 const fn lanes_by_index<T: FastLanes>() -> [u8; 1024] {
     let mut lanes = [0u8; 1024];
     const_for!(i in 0..1024 => {
@@ -203,9 +215,16 @@ const fn lanes_by_index<T: FastLanes>() -> [u8; 1024] {
     lanes
 }
 
+// helper function executed at compile-time to speed up unpack_single at runtime
 const fn rows_by_index<T: FastLanes>() -> [u8; 1024] {
     let mut rows = [0u8; 1024];
     const_for!(i in 0..1024 => {
+        // This is the inverse of the `index` function from the pack/unpack macros:
+        //     fn index(row: usize, lane: usize) -> usize {
+        //         let o = row / 8;
+        //         let s = row % 8;
+        //         (FL_ORDER[o] * 16) + (s * 128) + lane
+        //     }
         let lane = i % T::LANES;
         let s = i / 128; // because `(FL_ORDER[o] * 16) + lane` is always < 128
         let fl_order = (i - s * 128 - lane) / 16; // value of FL_ORDER[o]

From 9363cdbe93e7ad47715b77300d28c4ea98a2450a Mon Sep 17 00:00:00 2001
From: Will Manning <will@willmanning.io>
Date: Fri, 19 Jul 2024 14:20:58 +0100
Subject: [PATCH 15/16] remove unnecessary assert

---
 src/bitpacking.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index f382d3b..5dfe9ab 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -1,7 +1,7 @@
 use arrayref::{array_mut_ref, array_ref};
+use const_for::const_for;
 use core::mem::size_of;
 use paste::paste;
-use const_for::const_for;
 
 use crate::{pack, seq_t, unpack, FastLanes, Pred, Satisfied, FL_ORDER};
 
@@ -186,7 +186,6 @@ macro_rules! impl_packing {
                     let packed_len = 128 * width / size_of::<Self>();
                     debug_assert_eq!(packed.len(), packed_len, "Input buffer must be of size {}", packed_len);
                     debug_assert!(width <= Self::T, "Width must be less than or equal to {}", Self::T);
-                    debug_assert!(index < 1024, "index must be less than or equal to 1024");
 
                     seq_t!(W in $T {
                         match width {

From f819899eae1e046a77fab3c91d03f6e9ca179b21 Mon Sep 17 00:00:00 2001
From: Nicholas Gates <nick@nickgates.com>
Date: Fri, 19 Jul 2024 15:00:18 +0100
Subject: [PATCH 16/16] Remove inline never

---
 src/bitpacking.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/bitpacking.rs b/src/bitpacking.rs
index 5dfe9ab..8d0ca10 100644
--- a/src/bitpacking.rs
+++ b/src/bitpacking.rs
@@ -62,7 +62,6 @@ macro_rules! impl_packing {
     ($T:ty) => {
         paste! {
             impl BitPacking for $T {
-                #[inline(never)] // Makes it easier to disassemble and validate ASM.
                 fn pack<const W: usize>(
                     input: &[Self; 1024],
                     output: &mut [Self; 1024 * W / Self::T],
@@ -96,7 +95,6 @@ macro_rules! impl_packing {
                     })
                 }
 
-                #[inline(never)]
                 fn unpack<const W: usize>(
                     input: &[Self; 1024 * W / Self::T],
                     output: &mut [Self; 1024],