Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

diff: handle "matching" regions of different lengths, minor cleanup #4522

Merged
merged 3 commits into from
Oct 1, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 56 additions & 77 deletions lib/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ fn intersect_unchanged_words(

#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
base_range: Range<usize>,
offsets: Vec<isize>,
base: Range<usize>,
others: Vec<Range<usize>>,
}

impl UnchangedRange {
Expand All @@ -355,30 +355,11 @@ impl UnchangedRange {
other_positions: &[WordPosition],
) -> Self {
assert_eq!(other_sources.len(), other_positions.len());
let base_range = base_source.range_at(base_position);
let offsets = iter::zip(other_sources, other_positions)
.map(|(source, pos)| {
let other_range = source.range_at(*pos);
assert_eq!(base_range.len(), other_range.len());
other_range.start.wrapping_sub(base_range.start) as isize
})
let base = base_source.range_at(base_position);
let others = iter::zip(other_sources, other_positions)
.map(|(source, pos)| source.range_at(*pos))
.collect();
UnchangedRange {
base_range,
offsets,
}
}

fn start(&self, side: usize) -> usize {
self.base_range
.start
.wrapping_add(self.offsets[side] as usize)
}

fn end(&self, side: usize) -> usize {
self.base_range
.end
.wrapping_add(self.offsets[side] as usize)
UnchangedRange { base, others }
}
}

Expand All @@ -390,10 +371,10 @@ impl PartialOrd for UnchangedRange {

impl Ord for UnchangedRange {
fn cmp(&self, other: &Self) -> Ordering {
self.base_range
self.base
.start
.cmp(&other.base_range.start)
.then_with(|| self.base_range.end.cmp(&other.base_range.end))
.cmp(&other.base.start)
.then_with(|| self.base.end.cmp(&other.base.end))
}
}

Expand All @@ -403,9 +384,6 @@ impl Ord for UnchangedRange {
pub struct Diff<'input> {
base_input: &'input BStr,
other_inputs: Vec<&'input BStr>,
// The key is a range in the base input. The value is the start of each non-base region
// relative to the base region's start. By making them relative, they don't need to change
// when the base range changes.
unchanged_regions: Vec<UnchangedRange>,
}

Expand Down Expand Up @@ -456,8 +434,8 @@ impl<'input> Diff<'input> {
// to itself.
[] => {
let whole_range = UnchangedRange {
base_range: 0..base_source.text.len(),
offsets: vec![],
base: 0..base_source.text.len(),
others: vec![],
};
vec![whole_range]
}
Expand Down Expand Up @@ -506,13 +484,12 @@ impl<'input> Diff<'input> {
}
};
// Add an empty range at the end to make life easier for hunks().
let offsets = other_inputs
.iter()
.map(|input| input.len().wrapping_sub(base_input.len()) as isize)
.collect_vec();
unchanged_regions.push(UnchangedRange {
base_range: base_input.len()..base_input.len(),
offsets,
base: base_input.len()..base_input.len(),
others: other_inputs
.iter()
.map(|input| input.len()..input.len())
.collect(),
});

let mut diff = Self {
Expand Down Expand Up @@ -550,53 +527,54 @@ impl<'input> Diff<'input> {
}

pub fn hunks<'diff>(&'diff self) -> DiffHunkIterator<'diff, 'input> {
let previous_offsets = vec![0; self.other_inputs.len()];
DiffHunkIterator {
diff: self,
previous: UnchangedRange {
base_range: 0..0,
offsets: previous_offsets,
base: 0..0,
others: vec![0..0; self.other_inputs.len()],
},
unchanged_emitted: true,
unchanged_iter: self.unchanged_regions.iter(),
}
}

/// Returns contents between the `previous` ends and the `current` starts.
fn hunk_between<'a>(
&'a self,
previous: &'a UnchangedRange,
current: &'a UnchangedRange,
) -> impl Iterator<Item = &'input BStr> + 'a {
itertools::chain(
iter::once(&self.base_input[previous.base.end..current.base.start]),
itertools::izip!(&self.other_inputs, &previous.others, &current.others)
.map(|(input, prev, cur)| &input[prev.end..cur.start]),
)
}

/// Uses the given tokenizer to split the changed regions into smaller
/// regions. Then tries to finds unchanged regions among them.
pub fn refine_changed_regions(&mut self, tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>) {
let mut previous = UnchangedRange {
base_range: 0..0,
offsets: vec![0; self.other_inputs.len()],
base: 0..0,
others: vec![0..0; self.other_inputs.len()],
};
let mut new_unchanged_ranges = vec![];
for current in self.unchanged_regions.iter() {
// For the changed region between the previous region and the current one,
// create a new Diff instance. Then adjust the start positions and
// offsets to be valid in the context of the larger Diff instance
// (`self`).
let mut slices =
vec![&self.base_input[previous.base_range.end..current.base_range.start]];
for i in 0..current.offsets.len() {
let changed_range = previous.end(i)..current.start(i);
slices.push(&self.other_inputs[i][changed_range]);
}

let refined_diff = Diff::for_tokenizer(slices, &tokenizer);

for UnchangedRange {
base_range,
offsets,
} in refined_diff.unchanged_regions
{
let new_base_start = base_range.start + previous.base_range.end;
let new_base_end = base_range.end + previous.base_range.end;
let offsets = iter::zip(offsets, &previous.offsets)
.map(|(refi, prev)| refi + prev)
.collect_vec();
let refined_diff =
Diff::for_tokenizer(self.hunk_between(&previous, current), &tokenizer);
for refined in &refined_diff.unchanged_regions {
let new_base_start = refined.base.start + previous.base.end;
let new_base_end = refined.base.end + previous.base.end;
let new_others = iter::zip(&refined.others, &previous.others)
.map(|(refi, prev)| (refi.start + prev.end)..(refi.end + prev.end))
.collect();
new_unchanged_ranges.push(UnchangedRange {
base_range: new_base_start..new_base_end,
offsets,
base: new_base_start..new_base_end,
others: new_others,
});
}
previous = current.clone();
Expand All @@ -615,12 +593,15 @@ impl<'input> Diff<'input> {
let mut maybe_previous: Option<UnchangedRange> = None;
for current in self.unchanged_regions.iter() {
if let Some(previous) = maybe_previous {
if previous.base_range.end == current.base_range.start
&& previous.offsets == *current.offsets
if previous.base.end == current.base.start
&& iter::zip(&previous.others, &current.others)
.all(|(prev, cur)| prev.end == cur.start)
{
maybe_previous = Some(UnchangedRange {
base_range: previous.base_range.start..current.base_range.end,
offsets: current.offsets.clone(),
base: previous.base.start..current.base.end,
others: iter::zip(&previous.others, &current.others)
.map(|(prev, cur)| prev.start..cur.end)
.collect(),
});
continue;
}
Expand Down Expand Up @@ -667,19 +648,17 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
loop {
if !self.unchanged_emitted {
self.unchanged_emitted = true;
if !self.previous.base_range.is_empty() {
if !self.previous.base.is_empty() {
return Some(DiffHunk::Matching(
&self.diff.base_input[self.previous.base_range.clone()],
&self.diff.base_input[self.previous.base.clone()],
));
}
}
if let Some(current) = self.unchanged_iter.next() {
let mut slices = vec![
&self.diff.base_input[self.previous.base_range.end..current.base_range.start],
];
for (i, input) in self.diff.other_inputs.iter().enumerate() {
slices.push(&input[self.previous.end(i)..current.start(i)]);
}
let slices = self
.diff
.hunk_between(&self.previous, current)
.collect_vec();
self.previous = current.clone();
self.unchanged_emitted = false;
if slices.iter().any(|slice| !slice.is_empty()) {
Expand Down