diff: track total cost of search and bail if high

This is the last piece of the puzzle to get performance somewhat comparable to GNU diff without implementing all of its tricks - although this one is also used by GNU diff, in its own way. It brings a diff which still takes over a minute with the previous commit down to under a second.

Benchmark 1: diff test-data/b.cpp test-data/c.cpp
  Time (mean ± σ): 2.533 s ± 0.011 s [User: 2.494 s, System: 0.027 s]
  Range (min … max): 2.519 s … 2.553 s 10 runs
  Warning: Ignoring non-zero exit code.

Benchmark 2: ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp
  Time (mean ± σ): 65.798 s ± 1.080 s [User: 65.367 s, System: 0.053 s]
  Range (min … max): 64.962 s … 68.137 s 10 runs
  Warning: Ignoring non-zero exit code.

Benchmark 3: ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp
  Time (mean ± σ): 580.6 ms ± 6.5 ms [User: 521.9 ms, System: 38.8 ms]
  Range (min … max): 570.7 ms … 589.6 ms 10 runs
  Warning: Ignoring non-zero exit code.

Summary
  ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp ran
    4.36 ± 0.05 times faster than diff test-data/b.cpp test-data/c.cpp
    113.33 ± 2.26 times faster than ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp

It basically keeps track of how much work we have done overall for a diff job and enables giving up completely on trying to find ideal split points if the cost implies we had to trigger the "too expensive" heuristic too often. From that point forward it only does naive splitting of the work. This should not generate diffs which are much worse than doing the diagonal search, as it should only trigger in cases in which the files are so different it won't find good split points anyway.
This is another case in which GNU diff's additional work with hashing and splitting large chunks of inclusion / deletion from the diff work and trying harder to find ideal splits seems to cause it to perform slightly worse (see the benchmark results above). That said, GNU diff probably still generates better diffs not due to this, but due to its post-processing of the results, trying to create more hunks with nearby changes staying close to each other, which we do not do (but we didn't do that before anyway).
uutils · Nov 1, 2024 · 79e0bcc · 79e0bcc
1 parent ed1bbfa
commit 79e0bcc
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 13 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -22,6 +22,7 @@ same-file = "1.0.6"
 unicode-width = "0.2.0"
 tracing = "0.1.40"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+rand = "0.8.5"
 
 [dev-dependencies]
 pretty_assertions = "1.4.0"

diff --git a/src/engine.rs b/src/engine.rs
@@ -6,6 +6,7 @@
 use std::fmt::Debug;
 use std::ops::{Index, IndexMut, RangeInclusive};
 
+use rand::Rng as _;
 use tracing::{info, instrument, trace, Level};
 
 #[derive(Debug, Default, PartialEq)]
@@ -43,12 +44,44 @@ impl Snake {
 fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
     left: &[T],
     right: &[T],
+    total_cost: &mut usize,
 ) -> Snake {
     let left_length = left.len() as isize;
     let right_length = right.len() as isize;
 
     let max_cost = left_length + right_length;
 
+    // This constant is the value used by GNU diff; using it should give us
+    // more similar diffs.
+    const HIGH_COST: isize = 200;
+
+    // This magic number was borrowed from GNU diff - apparently this is a
+    // good number for modern CPUs.
+    let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
+    info!(too_expensive = too_expensive);
+
+    // We've been constantly hitting the too expensive heuristic, this means the
+    // files are too different for us to get a good diff in reasonable amount of
+    // time. Do naive splits from now on.
+    if *total_cost as isize > too_expensive * 10 {
+        info!(
+            total_cost = total_cost,
+            "hit too costly overall heuristic, creating naive split"
+        );
+        let mut rng = rand::thread_rng();
+        let x = if left_length == 0 {
+            0
+        } else {
+            rng.gen_range(0..left.len())
+        };
+        let y = if right_length == 0 {
+            0
+        } else {
+            rng.gen_range(0..right.len())
+        };
+        return Snake { x, y, length: 0 };
+    }
+
     // For collections of different sizes, the diagonals will not neatly balance. That means the
     // "middle" diagonal for the backwards search will be offset from the forward one, so we need
     // to keep track of that so we start at the right point.
@@ -87,20 +120,13 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
         x >= offset && y >= offset && x < left_length + offset && y < right_length + offset
     };
 
-    // This constant is the value used by GNU diff; using it should give us
-    // more similar diffs.
-    const HIGH_COST: isize = 200;
-
-    // This magic number was borrowed from GNU diff - apparently this is a
-    // good number for modern CPUs.
-    let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
-    info!(too_expensive = too_expensive);
-
     let mut best_snake = Snake::default();
 
     let forward_span = tracing::span!(Level::TRACE, "forward");
     let backward_span = tracing::span!(Level::TRACE, "backward");
     'outer: for c in 1..max_cost {
+        *total_cost += 1;
+
         info!(c = c, snake_length = best_snake.length);
         // The files appear to be large and too different. Go for good enough
         if c > too_expensive {
@@ -253,7 +279,8 @@ pub fn diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
 ) -> Vec<Edit<'a, T>> {
     trace!(left_length = left.len(), right_length = right.len());
     let mut edits = vec![];
-    do_diff(left, right, &mut edits);
+    let mut total_cost = 0;
+    do_diff(left, right, &mut edits, &mut total_cost);
     edits
 }
 
@@ -262,6 +289,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
     left: &'a [T],
     right: &'a [T],
     edits: &mut Vec<Edit<'a, T>>,
+    total_cost: &mut usize,
 ) {
     if left.is_empty() {
         right.iter().for_each(|r| edits.push(Edit::Insert(r)));
@@ -296,7 +324,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
     let left_remaining = &left[leading_matches..left.len() - trailing_matches];
     let right_remaining = &right[leading_matches..right.len() - trailing_matches];
 
-    let snake = find_split_point(left_remaining, right_remaining);
+    let snake = find_split_point(left_remaining, right_remaining, total_cost);
 
     trace!(x = snake.x, y = snake.y, length = snake.length, "snake");
 
@@ -321,8 +349,8 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
             "split"
         );
 
-        do_diff(l1, r1, edits);
-        do_diff(l2, r2, edits);
+        do_diff(l1, r1, edits, total_cost);
+        do_diff(l2, r2, edits, total_cost);
     }
 
     // Finally add the trailing matches.