Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local stream state #101

Merged
merged 6 commits into from
Oct 19, 2024
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 138 additions & 67 deletions src/structs/vpx_bool_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ use crate::metrics::{Metrics, ModelComponent};
use super::{branch::Branch, simple_hash::SimpleHash};

const BITS_IN_BYTE: i32 = 8;
const BITS_IN_VALUE: i32 = 32;
const BITS_IN_VALUE: i32 = 64;
const BITS_IN_VALUE_MINUS_LAST_BYTE: i32 = BITS_IN_VALUE - BITS_IN_BYTE;

pub struct VPXBoolReader<R> {
value: u32,
range: u32, // 128 << BITS_IN_VALUE_MINUS_LAST_BYTE <= range <= 255 << BITS_IN_VALUE_MINUS_LAST_BYTE
value: u64,
range: u64, // 128 << BITS_IN_VALUE_MINUS_LAST_BYTE <= range <= 255 << BITS_IN_VALUE_MINUS_LAST_BYTE
count: i32,
upstream_reader: R,
model_statistics: Metrics,
Expand Down Expand Up @@ -65,46 +65,149 @@ impl<R: Read> VPXBoolReader<R> {
self.model_statistics.drain()
}

/// Decodes one binary symbol from the boolean (VPX-style) arithmetic coder.
///
/// Operates entirely on caller-held decoder registers (`tmp_value`,
/// `tmp_range`, `tmp_count`) instead of `self`, so hot loops can keep the
/// state in locals and write it back once. The caller is responsible for
/// refilling `tmp_value` via `vpx_reader_fill` often enough: one call here
/// consumes at most 7 bits of `tmp_value` (range can renormalize from >127
/// down to 1).
///
/// `branch` supplies the adaptive probability for this bit and is updated
/// with the decoded outcome. Returns the decoded bit.
#[inline(always)]
pub fn get_bit(
branch: &mut Branch,
tmp_value: &mut u64,
tmp_range: &mut u64,
tmp_count: &mut i32,
) -> bool {
let probability = branch.get_probability() as u64;

// Proportional split of the current range. All arithmetic is carried in
// the top byte of the 64-bit register (everything is scaled by
// 1 << BITS_IN_VALUE_MINUS_LAST_BYTE); masking with
// 0xFF << BITS_IN_VALUE_MINUS_LAST_BYTE keeps the split in that byte and
// the final + (1 << ...) guarantees split >= 1 in top-byte units, so both
// halves of the range stay non-empty.
let split = ((((*tmp_range - (1 << BITS_IN_VALUE_MINUS_LAST_BYTE)) >> 8) * probability)
& (0xFF << BITS_IN_VALUE_MINUS_LAST_BYTE))
+ (1 << BITS_IN_VALUE_MINUS_LAST_BYTE);

// So optimizer understands that 0 should never happen and uses a cold jump
// if we don't have LZCNT on x86 CPUs (older BSR instruction requires check for zero).
// This is better since the branch prediction figures quickly this never happens and can run
// the code sequentially.
// NOTE(review): on u64 this expression wraps rather than going negative;
// the intended invariant is `*tmp_range > split` — confirm the assert still
// conveys the non-zero hint to LLVM in release builds.
#[cfg(all(
not(target_feature = "lzcnt"),
any(target_arch = "x86", target_arch = "x86_64")
))]
assert!(*tmp_range - split > 0);

// A value at or above the split decodes as 1, below as 0.
let bit = *tmp_value >= split;

// Adapt the branch probability toward the observed bit.
branch.record_and_update_bit(bit);

if bit {
// Keep the upper sub-range: shrink range and rebase value.
*tmp_range -= split;
*tmp_value -= split;
} else {
// Keep the lower sub-range.
*tmp_range = split;
}

// Renormalize: shift range back so its top byte is >= 128 (shift is the
// number of leading zero bits); the same shift consumes bits from value
// and from the count of buffered bits.
let shift = (*tmp_range).leading_zeros() as i32;

*tmp_value <<= shift;
*tmp_range <<= shift;
*tmp_count -= shift;

// NOTE(review): this function has no `self` receiver, yet both cfg-gated
// blocks below reference `self.model_statistics` / `self.hash` and `_cmp`;
// they cannot compile when the `compression_stats` or `detailed_tracing`
// features are enabled — confirm and rework (e.g. pass the stats/hash in,
// or move the instrumentation to the callers) before merging.
#[cfg(feature = "compression_stats")]
{
self.model_statistics
.record_compression_stats(_cmp, 1, i64::from(shift));
}

#[cfg(feature = "detailed_tracing")]
{
self.hash.hash(branch.get_u64());
// NOTE(review): these pass `&mut u64`/`&mut i32` references where the
// pre-change code hashed the plain values (`self.value` etc.) —
// presumably `*tmp_value`, `*tmp_count`, `*tmp_range` were intended;
// verify against SimpleHash::hash's signature.
self.hash.hash(tmp_value);
self.hash.hash(tmp_count);
self.hash.hash(tmp_range);

//if hash == 0x88f9c945
{
let hash = self.hash.get();

print!("({0}:{1:x})", bit as u8, hash);
if hash % 8 == 0 {
println!();
}
}
}

bit
}

#[inline(never)]
pub fn get_grid<const A: usize>(
&mut self,
branches: &mut [Branch; A],
cmp: ModelComponent,
_cmp: ModelComponent,
) -> Result<usize> {
// check if A is a power of 2
assert!((A & (A - 1)) == 0);

let mut tmp_value = self.value;
let mut tmp_range = self.range;
let mut tmp_count = self.count;

let mut decoded_so_far = 1;

for _index in 0..A.ilog2() {
let cur_bit = self.get(&mut branches[decoded_so_far], cmp)? as usize;
for index in 0..A.ilog2() {
// we can read only each 8-th iteration: minimum 56 bits are in value after vpx_reader_fill,
// and one get_bit consumes at most 7 bits (with range coming from >127 to 1)
if index & 7 == 0 {
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

let cur_bit = Self::get_bit(
&mut branches[decoded_so_far],
&mut tmp_value,
&mut tmp_range,
&mut tmp_count,
) as usize;
decoded_so_far <<= 1;
decoded_so_far |= cur_bit;
}

// remove set leading bit
let value = decoded_so_far ^ A;

self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

Ok(value)
}

#[inline(never)]
pub fn get_unary_encoded<const A: usize>(
&mut self,
branches: &mut [Branch; A],
cmp: ModelComponent,
_cmp: ModelComponent,
) -> Result<usize> {
let mut tmp_value = self.value;
let mut tmp_range = self.range;
let mut tmp_count = self.count;

let mut value = 0;

while value != A {
let cur_bit = self.get(&mut branches[value], cmp)?;
if value & 7 == 0 {
Melirius marked this conversation as resolved.
Show resolved Hide resolved
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

let cur_bit = Self::get_bit(
&mut branches[value],
&mut tmp_value,
&mut tmp_range,
&mut tmp_count,
);
if !cur_bit {
break;
}

value += 1;
}

self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

return Ok(value);
}

Expand All @@ -113,15 +216,34 @@ impl<R: Read> VPXBoolReader<R> {
&mut self,
n: usize,
branches: &mut [Branch; A],
cmp: ModelComponent,
_cmp: ModelComponent,
) -> Result<usize> {
assert!(n <= branches.len());

let mut tmp_value = self.value;
let mut tmp_range = self.range;
let mut tmp_count = self.count;

let mut coef = 0;
for i in (0..n).rev() {
coef |= (self.get(&mut branches[i], cmp)? as usize) << i;
// here the fastest way is to use this condition
if tmp_count < 0 {
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

coef |= (Self::get_bit(
&mut branches[i],
&mut tmp_value,
&mut tmp_range,
&mut tmp_count,
) as usize)
<< i;
}

self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

return Ok(coef);
}

Expand Down Expand Up @@ -156,70 +278,19 @@ impl<R: Read> VPXBoolReader<R> {
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

let probability = branch.get_probability() as u32;

let split = ((((tmp_range - (1 << BITS_IN_VALUE_MINUS_LAST_BYTE)) >> 8) * probability)
& (0xFF << BITS_IN_VALUE_MINUS_LAST_BYTE))
+ (1 << BITS_IN_VALUE_MINUS_LAST_BYTE);

// So optimizer understands that 0 should never happen and uses a cold jump
// if we don't have LZCNT on x86 CPUs (older BSR instruction requires check for zero).
// This is better since the branch prediction figures quickly this never happens and can run
// the code sequentially.
#[cfg(all(
not(target_feature = "lzcnt"),
any(target_arch = "x86", target_arch = "x86_64")
))]
assert!(tmp_range - split > 0);

let bit = tmp_value >= split;

branch.record_and_update_bit(bit);

if bit {
tmp_range -= split;
tmp_value -= split;
} else {
tmp_range = split;
}

let shift = tmp_range.leading_zeros() as i32;

self.value = tmp_value << shift;
self.range = tmp_range << shift;
self.count = tmp_count - shift;
let bit = Self::get_bit(branch, &mut tmp_value, &mut tmp_range, &mut tmp_count);

#[cfg(feature = "compression_stats")]
{
self.model_statistics
.record_compression_stats(_cmp, 1, i64::from(shift));
}

#[cfg(feature = "detailed_tracing")]
{
self.hash.hash(branch.get_u64());
self.hash.hash(self.value);
self.hash.hash(self.count);
self.hash.hash(self.range);

//if hash == 0x88f9c945
{
let hash = self.hash.get();

print!("({0}:{1:x})", bit as u8, hash);
if hash % 8 == 0 {
println!();
}
}
}
self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

return Ok(bit);
}

#[cold]
#[inline(always)]
fn vpx_reader_fill(
tmp_value: &mut u32,
tmp_value: &mut u64,
tmp_count: &mut i32,
upstream_reader: &mut R,
) -> Result<()> {
Expand All @@ -233,7 +304,7 @@ impl<R: Read> VPXBoolReader<R> {
break;
}

*tmp_value |= (v[0] as u32) << shift;
*tmp_value |= (v[0] as u64) << shift;
shift -= BITS_IN_BYTE;
*tmp_count += BITS_IN_BYTE;
}
Expand Down
Loading