Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local stream state #101

Merged
merged 6 commits into from
Oct 19, 2024
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 138 additions & 67 deletions src/structs/vpx_bool_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ use crate::metrics::{Metrics, ModelComponent};
use super::{branch::Branch, simple_hash::SimpleHash};

const BITS_IN_BYTE: i32 = 8;
const BITS_IN_VALUE: i32 = 32;
const BITS_IN_VALUE: i32 = 64;
const BITS_IN_VALUE_MINUS_LAST_BYTE: i32 = BITS_IN_VALUE - BITS_IN_BYTE;

pub struct VPXBoolReader<R> {
value: u32,
range: u32, // 128 << BITS_IN_VALUE_MINUS_LAST_BYTE <= range <= 255 << BITS_IN_VALUE_MINUS_LAST_BYTE
value: u64,
range: u64, // 128 << BITS_IN_VALUE_MINUS_LAST_BYTE <= range <= 255 << BITS_IN_VALUE_MINUS_LAST_BYTE
count: i32,
upstream_reader: R,
model_statistics: Metrics,
Expand Down Expand Up @@ -65,46 +65,149 @@ impl<R: Read> VPXBoolReader<R> {
self.model_statistics.drain()
}

/// Decodes one binary symbol from the boolean (VPX-style) arithmetic coder.
///
/// Operates entirely on caller-held decoder registers (`tmp_value`,
/// `tmp_range`, `tmp_count`) instead of `self`, so hot loops can keep the
/// state in locals and write it back once. The caller is responsible for
/// refilling `tmp_value` via `vpx_reader_fill` often enough: one call here
/// consumes at most 7 bits of `tmp_value` (range can renormalize from >127
/// down to 1).
///
/// `branch` supplies the adaptive probability for this bit and is updated
/// with the decoded outcome. Returns the decoded bit.
#[inline(always)]
pub fn get_bit(
branch: &mut Branch,
tmp_value: &mut u64,
tmp_range: &mut u64,
tmp_count: &mut i32,
) -> bool {
let probability = branch.get_probability() as u64;

// Proportional split of the current range. All arithmetic is carried in
// the top byte of the 64-bit register (everything is scaled by
// 1 << BITS_IN_VALUE_MINUS_LAST_BYTE); masking with
// 0xFF << BITS_IN_VALUE_MINUS_LAST_BYTE keeps the split in that byte and
// the final + (1 << ...) guarantees split >= 1 in top-byte units, so both
// halves of the range stay non-empty.
let split = ((((*tmp_range - (1 << BITS_IN_VALUE_MINUS_LAST_BYTE)) >> 8) * probability)
& (0xFF << BITS_IN_VALUE_MINUS_LAST_BYTE))
+ (1 << BITS_IN_VALUE_MINUS_LAST_BYTE);

// So optimizer understands that 0 should never happen and uses a cold jump
// if we don't have LZCNT on x86 CPUs (older BSR instruction requires check for zero).
// This is better since the branch prediction figures quickly this never happens and can run
// the code sequentially.
// NOTE(review): on u64 this expression wraps rather than going negative;
// the intended invariant is `*tmp_range > split` — confirm the assert still
// conveys the non-zero hint to LLVM in release builds.
#[cfg(all(
not(target_feature = "lzcnt"),
any(target_arch = "x86", target_arch = "x86_64")
))]
assert!(*tmp_range - split > 0);

// A value at or above the split decodes as 1, below as 0.
let bit = *tmp_value >= split;

// Adapt the branch probability toward the observed bit.
branch.record_and_update_bit(bit);

if bit {
// Keep the upper sub-range: shrink range and rebase value.
*tmp_range -= split;
*tmp_value -= split;
} else {
// Keep the lower sub-range.
*tmp_range = split;
}

// Renormalize: shift range back so its top byte is >= 128 (shift is the
// number of leading zero bits); the same shift consumes bits from value
// and from the count of buffered bits.
let shift = (*tmp_range).leading_zeros() as i32;

*tmp_value <<= shift;
*tmp_range <<= shift;
*tmp_count -= shift;

// NOTE(review): this function has no `self` receiver, yet both cfg-gated
// blocks below reference `self.model_statistics` / `self.hash` and `_cmp`;
// they cannot compile when the `compression_stats` or `detailed_tracing`
// features are enabled — confirm and rework (e.g. pass the stats/hash in,
// or move the instrumentation to the callers) before merging.
#[cfg(feature = "compression_stats")]
{
self.model_statistics
.record_compression_stats(_cmp, 1, i64::from(shift));
}

#[cfg(feature = "detailed_tracing")]
{
self.hash.hash(branch.get_u64());
// NOTE(review): these pass `&mut u64`/`&mut i32` references where the
// pre-change code hashed the plain values (`self.value` etc.) —
// presumably `*tmp_value`, `*tmp_count`, `*tmp_range` were intended;
// verify against SimpleHash::hash's signature.
self.hash.hash(tmp_value);
self.hash.hash(tmp_count);
self.hash.hash(tmp_range);

//if hash == 0x88f9c945
{
let hash = self.hash.get();

print!("({0}:{1:x})", bit as u8, hash);
if hash % 8 == 0 {
println!();
}
}
}

bit
}

#[inline(never)]
pub fn get_grid<const A: usize>(
&mut self,
branches: &mut [Branch; A],
cmp: ModelComponent,
_cmp: ModelComponent,
) -> Result<usize> {
// check if A is a power of 2
assert!((A & (A - 1)) == 0);

let mut tmp_value = self.value;
let mut tmp_range = self.range;
let mut tmp_count = self.count;

let mut decoded_so_far = 1;

for _index in 0..A.ilog2() {
let cur_bit = self.get(&mut branches[decoded_so_far], cmp)? as usize;
for index in 0..A.ilog2() {
// we can read only each 8-th iteration: minimum 56 bits are in value after vpx_reader_fill,
// and one get_bit consumes at most 7 bits (with range coming from >127 to 1)
if index & 7 == 0 {
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

let cur_bit = Self::get_bit(
&mut branches[decoded_so_far],
&mut tmp_value,
&mut tmp_range,
&mut tmp_count,
) as usize;
decoded_so_far <<= 1;
decoded_so_far |= cur_bit;
}

// remove set leading bit
let value = decoded_so_far ^ A;

self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

Ok(value)
}

#[inline(never)]
pub fn get_unary_encoded<const A: usize>(
&mut self,
branches: &mut [Branch; A],
cmp: ModelComponent,
_cmp: ModelComponent,
) -> Result<usize> {
let mut tmp_value = self.value;
let mut tmp_range = self.range;
let mut tmp_count = self.count;

let mut value = 0;

while value != A {
let cur_bit = self.get(&mut branches[value], cmp)?;
if value & 7 == 0 {
Melirius marked this conversation as resolved.
Show resolved Hide resolved
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

let cur_bit = Self::get_bit(
&mut branches[value],
&mut tmp_value,
&mut tmp_range,
&mut tmp_count,
);
if !cur_bit {
break;
}

value += 1;
}

self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

return Ok(value);
}

Expand All @@ -113,15 +216,34 @@ impl<R: Read> VPXBoolReader<R> {
&mut self,
n: usize,
branches: &mut [Branch; A],
cmp: ModelComponent,
_cmp: ModelComponent,
) -> Result<usize> {
assert!(n <= branches.len());

let mut tmp_value = self.value;
let mut tmp_range = self.range;
let mut tmp_count = self.count;

let mut coef = 0;
for i in (0..n).rev() {
coef |= (self.get(&mut branches[i], cmp)? as usize) << i;
// here the fastest way is to use this condition
if tmp_count < 0 {
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

coef |= (Self::get_bit(
&mut branches[i],
&mut tmp_value,
&mut tmp_range,
&mut tmp_count,
) as usize)
<< i;
}

self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

return Ok(coef);
}

Expand Down Expand Up @@ -156,70 +278,19 @@ impl<R: Read> VPXBoolReader<R> {
Self::vpx_reader_fill(&mut tmp_value, &mut tmp_count, &mut self.upstream_reader)?;
}

let probability = branch.get_probability() as u32;

let split = ((((tmp_range - (1 << BITS_IN_VALUE_MINUS_LAST_BYTE)) >> 8) * probability)
& (0xFF << BITS_IN_VALUE_MINUS_LAST_BYTE))
+ (1 << BITS_IN_VALUE_MINUS_LAST_BYTE);

// So optimizer understands that 0 should never happen and uses a cold jump
// if we don't have LZCNT on x86 CPUs (older BSR instruction requires check for zero).
// This is better since the branch prediction figures quickly this never happens and can run
// the code sequentially.
#[cfg(all(
not(target_feature = "lzcnt"),
any(target_arch = "x86", target_arch = "x86_64")
))]
assert!(tmp_range - split > 0);

let bit = tmp_value >= split;

branch.record_and_update_bit(bit);

if bit {
tmp_range -= split;
tmp_value -= split;
} else {
tmp_range = split;
}

let shift = tmp_range.leading_zeros() as i32;

self.value = tmp_value << shift;
self.range = tmp_range << shift;
self.count = tmp_count - shift;
let bit = Self::get_bit(branch, &mut tmp_value, &mut tmp_range, &mut tmp_count);

#[cfg(feature = "compression_stats")]
{
self.model_statistics
.record_compression_stats(_cmp, 1, i64::from(shift));
}

#[cfg(feature = "detailed_tracing")]
{
self.hash.hash(branch.get_u64());
self.hash.hash(self.value);
self.hash.hash(self.count);
self.hash.hash(self.range);

//if hash == 0x88f9c945
{
let hash = self.hash.get();

print!("({0}:{1:x})", bit as u8, hash);
if hash % 8 == 0 {
println!();
}
}
}
self.value = tmp_value;
self.range = tmp_range;
self.count = tmp_count;

return Ok(bit);
}

#[cold]
#[inline(always)]
fn vpx_reader_fill(
tmp_value: &mut u32,
tmp_value: &mut u64,
tmp_count: &mut i32,
upstream_reader: &mut R,
) -> Result<()> {
Expand All @@ -233,7 +304,7 @@ impl<R: Read> VPXBoolReader<R> {
break;
}

*tmp_value |= (v[0] as u32) << shift;
*tmp_value |= (v[0] as u64) << shift;
shift -= BITS_IN_BYTE;
*tmp_count += BITS_IN_BYTE;
}
Expand Down
Loading