diff: apply heuristics borrowed from GNU diff for "good enough"

This change adds some checks to decide that the search for the best place to split the diffing process has gone on for too long, or for long enough while having found a good chunk of matches. They are based on similar heuristics that GNU diff applies, and will help in cases in which files are very long and have few common sequences.

This brings comparing some large files (~36MB) that are very different from ~1 hour down to ~8 seconds, but it will still hit some pathological cases, such as some very large cpp files I created for benchmarking, which still take 1 minute.

Benchmark 1: diff test-data/huge-base test-data/huge-very-different
  Time (mean ± σ): 2.790 s ± 0.005 s [User: 2.714 s, System: 0.063 s]
  Range (min … max): 2.781 s … 2.798 s 10 runs
  Warning: Ignoring non-zero exit code.

Benchmark 2: ./target/release/diffutils.no-heuristics diff test-data/huge-base test-data/huge-very-different
  Time (mean ± σ): 4755.084 s ± 172.607 s [User: 4727.169 s, System: 0.330 s]
  Range (min … max): 4607.522 s … 5121.135 s 10 runs
  Warning: Ignoring non-zero exit code.

Benchmark 3: ./target/release/diffutils diff test-data/huge-base test-data/huge-very-different
  Time (mean ± σ): 7.197 s ± 0.099 s [User: 7.055 s, System: 0.094 s]
  Range (min … max): 7.143 s … 7.416 s 10 runs
  Warning: Ignoring non-zero exit code.
  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet system without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.

Summary
  diff test-data/huge-base test-data/huge-very-different ran
    2.58 ± 0.04 times faster than ./target/release/diffutils diff test-data/huge-base test-data/huge-very-different
    1704.04 ± 61.93 times faster than ./target/release/diffutils.no-heuristics diff test-data/huge-base test-data/huge-very-different

Note that the worst that should happen when the heuristics cause the search to end early is a suboptimal diff; the diff will still be correct and usable with patch.
uutils · Nov 1, 2024 · ed1bbfa · ed1bbfa
1 parent 74d2fad
commit ed1bbfa
Showing 1 changed file with 50 additions and 1 deletion.
diff --git a/src/engine.rs b/src/engine.rs
@@ -16,6 +16,11 @@ struct Snake {
 }
 
 impl Snake {
+    fn is_good(&self) -> bool {
+        // This magic number comes from GNU diff.
+        self.length > 20
+    }
+
     fn maybe_update(&mut self, x: isize, y: isize, length: isize) {
         let length = length as usize;
         if length > self.length {
@@ -82,11 +87,26 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
         x >= offset && y >= offset && x < left_length + offset && y < right_length + offset
     };
 
+    // This constant is the value used by GNU diff; using it should give us
+    // more similar diffs.
+    const HIGH_COST: isize = 200;
+
+    // This magic number was borrowed from GNU diff - apparently this is a
+    // good number for modern CPUs.
+    let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
+    info!(too_expensive = too_expensive);
+
     let mut best_snake = Snake::default();
 
     let forward_span = tracing::span!(Level::TRACE, "forward");
     let backward_span = tracing::span!(Level::TRACE, "backward");
-    'outer: for _ in 1..max_cost {
+    'outer: for c in 1..max_cost {
+        info!(c = c, snake_length = best_snake.length);
+        // The files appear to be large and too different. Go for good enough
+        if c > too_expensive {
+            break 'outer;
+        }
+
         // Forwards search
         forward_diagonals.expand_search();
         let fwd = forward_span.enter();
@@ -192,7 +212,21 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
             }
         }
         drop(bwd);
+
+        if c > HIGH_COST && best_snake.is_good() {
+            info!("met criteria for high cost with good snake heuristic");
+            break 'outer;
+        }
+    }
+
+    // If we hit this condition, the search ran too long and found 0 matches.
+    // Get the best we can do as a split point - furthest diagonal.
+    if best_snake.length == 0 {
+        let (x, y) = forward_diagonals.get_furthest_progress();
+        best_snake.x = x;
+        best_snake.y = y;
     }
+
     info!(
         x = best_snake.x,
         y = best_snake.y,
@@ -355,6 +389,21 @@ impl Diagonals {
         actual >= 0 && (actual as usize) < self.data.len()
     }
 
+    fn get_furthest_progress(&self) -> (usize, usize) {
+        let (d, x) = self
+            .data
+            .iter()
+            .enumerate()
+            .filter(|(d, &x)| x - (*d as isize) >= 0)
+            .max_by_key(|(_, &x)| x)
+            .map(|(i, x)| (i as isize, *x))
+            .unwrap_or((0isize, 0isize));
+        let y = x - d;
+        debug_assert!(x >= 0);
+        debug_assert!(y >= 0);
+        (x as usize, y as usize)
+    }
+
     fn expand_search(&mut self) {
         let upper = if *self.search_range.end() == self.max_diag {
             self.search_range.end() - 1