From 4460093d93fe4f12a11fda8a6984e77b547e14bd Mon Sep 17 00:00:00 2001 From: Kristof Roomp Date: Fri, 20 Dec 2024 23:14:55 +0100 Subject: [PATCH] Optimize byte flushing in bool_writer (#130) * clean up boolwriter a bit and make it slightly faster * comments --- src/helpers.rs | 8 ++++++ src/structs/vpx_bool_writer.rs | 51 ++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/helpers.rs b/src/helpers.rs index 8e6c80e..871b54a 100644 --- a/src/helpers.rs +++ b/src/helpers.rs @@ -161,6 +161,14 @@ pub fn calc_sign_index(val: i16) -> usize { } } +/// This checks to see if a vector can fit additional elements without growing, +/// but does it in such a way that the optimizer understands that a subsequent +/// push or extend will not need to grow the vector. +#[inline(always)] +pub fn needs_to_grow(v: &Vec, additional: usize) -> bool { + additional > v.capacity().wrapping_sub(v.len()) +} + #[cfg(test)] pub fn get_rand_from_seed(seed: [u8; 32]) -> rand_chacha::ChaCha12Rng { use rand_chacha::rand_core::SeedableRng; diff --git a/src/structs/vpx_bool_writer.rs b/src/structs/vpx_bool_writer.rs index 6ac7aa4..930e05c 100644 --- a/src/structs/vpx_bool_writer.rs +++ b/src/structs/vpx_bool_writer.rs @@ -24,6 +24,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” use std::io::{Result, Write}; +use crate::helpers::needs_to_grow; use crate::metrics::{Metrics, ModelComponent}; use crate::structs::branch::Branch; use crate::structs::simple_hash::SimpleHash; @@ -112,23 +113,39 @@ impl VPXBoolWriter { // check whether we cannot put next bit into stream if tmp_value & (u64::MAX << 57) != 0 { - let mut stream_bits = 64 - tmp_value.leading_zeros() - 2; - // 62 >= stream_bits >= 56 - - if tmp_value & (1 << stream_bits) != 0 { + // calculate the number odd bits left over after we remove: + // - 48 bits (6 bytes) flushed to buffer + // - 8 bits need to keep for coding accuracy (since probability resolution is 8 bits) + // - 1 bit for marker + // - 1 bit for overflow + // + // leftover_bits will always be <= 8 + let leftover_bits = tmp_value.leading_zeros() + 2; + + // shift align so that the top 6 bytes are ones we want to write, if there + // was an overflow it gets rotated down to the bottom bit + let v_aligned = tmp_value.rotate_left(leftover_bits); + + if (v_aligned & 1) != 0 { self.carry(); } - for _stream_bytes in 0..6 { - stream_bits -= 8; - self.buffer.push((tmp_value >> stream_bits) as u8); + // Append the top six bytes of the u64 into buffer in big endian so that the top byte goes first. + if needs_to_grow(&self.buffer, 8) { + // avoid inlining slow path to allocate more memory that happens almost never + put_6bytes(&mut self.buffer, v_aligned); + } else { + // Faster to add all 8 and then shrink the buffer than add 6 that creates a temporary buffer. + let b = v_aligned.to_be_bytes(); + self.buffer.extend_from_slice(&b); + self.buffer.truncate(self.buffer.len() - 2); } - tmp_value &= (1 << stream_bits) - 1; - tmp_value |= 1 << (stream_bits + 1); - // 14 >= stream_bits >= 8 + // mask the remaining bits (between 8 and 16) and put them back to where they were + // adding the marker bit to the top + tmp_value = ((v_aligned & 0xffff) | 0x20000/*marker bit*/) >> leftover_bits; } - // 55 >= stream_bits >= 8 + (tmp_value, tmp_range) } @@ -136,7 +153,10 @@ impl VPXBoolWriter { /// of the first stream byte - then `carry` cannot be invoked on empty `buffer`, /// and after the stream beginning `flush_non_final_data` keeps carry-terminating /// byte sequence (one non-255-byte before any number of 255-bytes) inside the `buffer`. - #[inline(always)] + /// + /// Cold to keep this out of the inner loop since carries are pretty rare + #[cold] + #[inline(never)] fn carry(&mut self) { let mut x = self.buffer.len() - 1; @@ -310,6 +330,13 @@ impl VPXBoolWriter { } } +#[cold] +#[inline(never)] +fn put_6bytes(buffer: &mut Vec, v: u64) { + let b = v.to_be_bytes(); + buffer.extend_from_slice(b[0..6].as_ref()); +} + #[cfg(test)] use crate::structs::vpx_bool_reader::VPXBoolReader;