From 10ff5a0824a800d2df688894c904ebfc1b2a2dce Mon Sep 17 00:00:00 2001 From: Kornel Date: Mon, 29 Jan 2024 13:29:27 +0000 Subject: [PATCH] Optimize match_len == 3 (#146) * Add throughput to benches * Trim debug/error fmt * Amortize bounds checks in apply_match fast path * Avoid panic in tree_lookup --- benches/bench.rs | 2 ++ miniz_oxide/src/inflate/core.rs | 30 ++++++++++++++++++++++++------ miniz_oxide/src/inflate/mod.rs | 3 ++- src/lib_oxide.rs | 3 ++- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index d30deaa1..c71e9fa2 100755 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -50,6 +50,7 @@ macro_rules! decompress_bench { let compressed = compress_to_vec(input.as_slice(), $level); let mut out_len: usize = 0; + b.bytes = input.len() as _; b.iter(|| unsafe { w($decompress_func( compressed.as_ptr() as *mut c_void, @@ -70,6 +71,7 @@ macro_rules! compress_bench { let mut out_len: usize = 0; let flags = create_comp_flags_from_zip_params($level, -15, 0) as i32; + b.bytes = input.len() as _; b.iter(|| unsafe { w($compress_func( input.as_ptr() as *mut c_void, diff --git a/miniz_oxide/src/inflate/core.rs b/miniz_oxide/src/inflate/core.rs index e1521395..bd29cd3a 100644 --- a/miniz_oxide/src/inflate/core.rs +++ b/miniz_oxide/src/inflate/core.rs @@ -2,6 +2,7 @@ use super::*; use crate::shared::{update_adler32, HUFFMAN_LENGTH_ORDER}; +use ::core::cell::Cell; use ::core::convert::TryInto; use ::core::{cmp, slice}; @@ -52,7 +53,12 @@ impl HuffmanTable { loop { // symbol here indicates the position of the left (0) node, if the next bit is 1 // we add 1 to the lookup position to get the right node. 
- symbol = i32::from(self.tree[(!symbol + ((bit_buf >> code_len) & 1) as i32) as usize]); + let tree_index = (!symbol + ((bit_buf >> code_len) & 1) as i32) as usize; + debug_assert!(tree_index < self.tree.len()); + if tree_index >= self.tree.len() { + break; + } + symbol = i32::from(self.tree[tree_index]); code_len += 1; if symbol >= 0 { break; @@ -896,15 +902,27 @@ fn apply_match( match_len: usize, out_buf_size_mask: usize, ) { - debug_assert!(out_pos + match_len <= out_slice.len()); + debug_assert!(out_pos.checked_add(match_len).unwrap() <= out_slice.len()); let source_pos = out_pos.wrapping_sub(dist) & out_buf_size_mask; if match_len == 3 { - // Fast path for match len 3. - out_slice[out_pos] = out_slice[source_pos]; - out_slice[out_pos + 1] = out_slice[(source_pos + 1) & out_buf_size_mask]; - out_slice[out_pos + 2] = out_slice[(source_pos + 2) & out_buf_size_mask]; + let out_slice = Cell::from_mut(out_slice).as_slice_of_cells(); + if let Some(dst) = out_slice.get(out_pos..out_pos + 3) { + // Moving bounds checks before any memory mutation allows the optimizer + // to combine them together. + let src = out_slice + .get(source_pos) + .zip(out_slice.get((source_pos + 1) & out_buf_size_mask)) + .zip(out_slice.get((source_pos + 2) & out_buf_size_mask)); + if let Some(((a, b), c)) = src { + // For correctness, the memory reads and writes have to be interleaved. + // Cells make it possible for read and write references to overlap. 
+ dst[0].set(a.get()); + dst[1].set(b.get()); + dst[2].set(c.get()); + } + } return; } diff --git a/miniz_oxide/src/inflate/mod.rs b/miniz_oxide/src/inflate/mod.rs index 4ddc7441..a4fa5cf8 100644 --- a/miniz_oxide/src/inflate/mod.rs +++ b/miniz_oxide/src/inflate/mod.rs @@ -90,13 +90,14 @@ pub struct DecompressError { #[cfg(feature = "with-alloc")] impl alloc::fmt::Display for DecompressError { + #[cold] fn fmt(&self, f: &mut ::core::fmt::Formatter<'_>) -> ::core::fmt::Result { f.write_str(match self.status { TINFLStatus::FailedCannotMakeProgress => "Truncated input stream", TINFLStatus::BadParam => "Invalid output buffer size", TINFLStatus::Adler32Mismatch => "Adler32 checksum mismatch", TINFLStatus::Failed => "Invalid input data", - TINFLStatus::Done => unreachable!(), + TINFLStatus::Done => "", // Unreachable TINFLStatus::NeedsMoreInput => "Truncated input stream", TINFLStatus::HasMoreOutput => "Output size exceeded the specified limit", }) diff --git a/src/lib_oxide.rs b/src/lib_oxide.rs index 8b0dc2a5..6473d5a5 100644 --- a/src/lib_oxide.rs +++ b/src/lib_oxide.rs @@ -24,12 +24,13 @@ pub enum InternalState { } impl fmt::Debug for InternalState { + #[cold] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let name = match &self { InternalState::Inflate(_) => "Decompressor", InternalState::Deflate(_) => "Compressor", }; - write!(f, "{}", name) + f.write_str(name) } }