Skip to content

Commit

Permalink
diff: track total cost of search and bail if high
Browse files Browse the repository at this point in the history
This is the last piece of the puzzle for getting performance somewhat
comparable to GNU diff's without implementing all of its tricks - although
this one is also used by GNU diff, in its own way. It brings a diff that
still took over a minute with the previous commit down to under a
second.

  Benchmark 1: diff test-data/b.cpp test-data/c.cpp
    Time (mean ± σ):      2.533 s ±  0.011 s    [User: 2.494 s, System: 0.027 s]
    Range (min … max):    2.519 s …  2.553 s    10 runs

    Warning: Ignoring non-zero exit code.

  Benchmark 2: ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp
    Time (mean ± σ):     65.798 s ±  1.080 s    [User: 65.367 s, System: 0.053 s]
    Range (min … max):   64.962 s … 68.137 s    10 runs

    Warning: Ignoring non-zero exit code.

  Benchmark 3: ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp
    Time (mean ± σ):     580.6 ms ±   6.5 ms    [User: 521.9 ms, System: 38.8 ms]
    Range (min … max):   570.7 ms … 589.6 ms    10 runs

    Warning: Ignoring non-zero exit code.

  Summary
    ./target/release/diffutils diff test-data/b.cpp test-data/c.cpp ran
      4.36 ± 0.05 times faster than diff test-data/b.cpp test-data/c.cpp
    113.33 ± 2.26 times faster than ./target/release/diffutils.local-heuristics diff test-data/b.cpp test-data/c.cpp

It keeps track of how much work we have done overall for a diff job and
gives up completely on trying to find ideal split points once the
accumulated cost implies we have had to trigger the "too expensive"
heuristic too often.

From that point forward it only does naive splitting of the work.
This should not generate diffs which are much worse than doing the
diagonal search, as it should only trigger in cases in which the
files are so different it won't find good split points anyway.

This is another case in which GNU diff's additional work - hashing,
splitting large chunks of inclusion / deletion out of the diff work,
and trying harder to find ideal splits - seems to cause it to perform
slightly worse (see the benchmark above).

That said, GNU diff probably still generates better diffs not due to
this, but due to its post-processing of the results, trying to create
more hunks with nearby changes staying close to each other, which we
do not do (but we didn't do that before anyway).
  • Loading branch information
kov committed Nov 1, 2024
1 parent ed1bbfa commit 79e0bcc
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 13 deletions.
84 changes: 84 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ same-file = "1.0.6"
unicode-width = "0.2.0"
tracing = "0.1.40"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
rand = "0.8.5"

[dev-dependencies]
pretty_assertions = "1.4.0"
Expand Down
54 changes: 41 additions & 13 deletions src/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use std::fmt::Debug;
use std::ops::{Index, IndexMut, RangeInclusive};

use rand::Rng as _;
use tracing::{info, instrument, trace, Level};

#[derive(Debug, Default, PartialEq)]
Expand Down Expand Up @@ -43,12 +44,44 @@ impl Snake {
fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
left: &[T],
right: &[T],
total_cost: &mut usize,
) -> Snake {
let left_length = left.len() as isize;
let right_length = right.len() as isize;

let max_cost = left_length + right_length;

// This constant is the value used by GNU diff; using it should give us
// more similar diffs.
const HIGH_COST: isize = 200;

// This magic number was borrowed from GNU diff - apparently this is a
// good number for modern CPUs.
let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
info!(too_expensive = too_expensive);

// We've been constantly hitting the too expensive heuristic, this means the
// files are too different for us to get a good diff in reasonable amount of
// time. Do naive splits from now on.
if *total_cost as isize > too_expensive * 10 {
info!(
total_cost = total_cost,
"hit too costly overall heuristic, creating naive split"
);
let mut rng = rand::thread_rng();
let x = if left_length == 0 {
0
} else {
rng.gen_range(0..left.len())
};
let y = if right_length == 0 {
0
} else {
rng.gen_range(0..right.len())
};
return Snake { x, y, length: 0 };
}

// For collections of different sizes, the diagonals will not neatly balance. That means the
// "middle" diagonal for the backwards search will be offset from the forward one, so we need
// to keep track of that so we start at the right point.
Expand Down Expand Up @@ -87,20 +120,13 @@ fn find_split_point<T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
x >= offset && y >= offset && x < left_length + offset && y < right_length + offset
};

// This constant is the value used by GNU diff; using it should give us
// more similar diffs.
const HIGH_COST: isize = 200;

// This magic number was borrowed from GNU diff - apparently this is a
// good number for modern CPUs.
let too_expensive: isize = ((max_cost as f64).sqrt() as isize).max(4096);
info!(too_expensive = too_expensive);

let mut best_snake = Snake::default();

let forward_span = tracing::span!(Level::TRACE, "forward");
let backward_span = tracing::span!(Level::TRACE, "backward");
'outer: for c in 1..max_cost {
*total_cost += 1;

info!(c = c, snake_length = best_snake.length);
// The files appear to be large and too different. Go for good enough
if c > too_expensive {
Expand Down Expand Up @@ -253,7 +279,8 @@ pub fn diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
) -> Vec<Edit<'a, T>> {
trace!(left_length = left.len(), right_length = right.len());
let mut edits = vec![];
do_diff(left, right, &mut edits);
let mut total_cost = 0;
do_diff(left, right, &mut edits, &mut total_cost);
edits
}

Expand All @@ -262,6 +289,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
left: &'a [T],
right: &'a [T],
edits: &mut Vec<Edit<'a, T>>,
total_cost: &mut usize,
) {
if left.is_empty() {
right.iter().for_each(|r| edits.push(Edit::Insert(r)));
Expand Down Expand Up @@ -296,7 +324,7 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
let left_remaining = &left[leading_matches..left.len() - trailing_matches];
let right_remaining = &right[leading_matches..right.len() - trailing_matches];

let snake = find_split_point(left_remaining, right_remaining);
let snake = find_split_point(left_remaining, right_remaining, total_cost);

trace!(x = snake.x, y = snake.y, length = snake.length, "snake");

Expand All @@ -321,8 +349,8 @@ fn do_diff<'a, T: Clone + Debug + PartialEq + Into<Vec<u8>>>(
"split"
);

do_diff(l1, r1, edits);
do_diff(l2, r2, edits);
do_diff(l1, r1, edits, total_cost);
do_diff(l2, r2, edits, total_cost);
}

// Finally add the trailing matches.
Expand Down

0 comments on commit 79e0bcc

Please sign in to comment.