From 5b4b7a2864b05e042f11ccbc9b992f1e83e714a6 Mon Sep 17 00:00:00 2001
From: Yuya Nishihara <yuya@tcha.org>
Date: Tue, 24 Sep 2024 13:23:36 +0900
Subject: [PATCH] diff: do not translate word-range indices by
 collect_unchanged_ranges()

Intersection of unchanged ranges becomes a simple merge-join loop, so I've
removed the existing tests. I also added a fast path for the common 2-way
diffs in which we don't have to build vec![(pos, vec![pos])].

One source of confusion introduced by this change is that WordPosition means
both global and local indices. This is covered by the added tests, but I'll
probably add separate local/global types later.
---
 lib/src/diff.rs | 357 ++++++++++++++++++++++++++----------------------
 1 file changed, 190 insertions(+), 167 deletions(-)
diff --git a/lib/src/diff.rs b/lib/src/diff.rs
index b482f34640..9aa8be879d 100644
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -14,8 +14,6 @@
 
 #![allow(missing_docs)]
 
-use std::cmp::max;
-use std::cmp::min;
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
 use std::collections::HashMap;
@@ -81,6 +79,8 @@ struct WordPosition(usize);
 struct DiffSource<'input, 'aux> {
     text: &'input BStr,
     ranges: &'aux [Range<usize>],
+    /// The number of preceding word ranges excluded from the self `ranges`.
+    global_offset: WordPosition,
 }
 
 impl<'input, 'aux> DiffSource<'input, 'aux> {
@@ -88,6 +88,7 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
         DiffSource {
             text: BStr::new(text),
             ranges,
+            global_offset: WordPosition(0),
         }
     }
 
@@ -95,12 +96,17 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
         DiffSource {
             text: self.text,
             ranges: &self.ranges[positions.start.0..positions.end.0],
+            global_offset: self.map_to_global(positions.start),
         }
     }
 
     fn range_at(&self, position: WordPosition) -> Range<usize> {
         self.ranges[position.0].clone()
     }
+
+    fn map_to_global(&self, position: WordPosition) -> WordPosition {
+        WordPosition(self.global_offset.0 + position.0)
+    }
 }
 
 struct Histogram<'a> {
@@ -187,10 +193,10 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
     result
 }
 
-/// Finds unchanged ranges among the ones given as arguments. The data between
-/// those ranges is ignored.
-fn collect_unchanged_ranges(
-    found_ranges: &mut Vec<(Range<usize>, Range<usize>)>,
+/// Finds unchanged word (or token) positions among the ones given as
+/// arguments. The data between those words is ignored.
+fn collect_unchanged_words(
+    found_positions: &mut Vec<(WordPosition, WordPosition)>,
     left: &DiffSource,
     right: &DiffSource,
 ) {
@@ -199,9 +205,9 @@ fn collect_unchanged_ranges(
     }
 
     // Prioritize LCS-based algorithm than leading/trailing matches
-    let old_len = found_ranges.len();
-    collect_unchanged_ranges_lcs(found_ranges, left, right);
-    if found_ranges.len() != old_len {
+    let old_len = found_positions.len();
+    collect_unchanged_words_lcs(found_positions, left, right);
+    if found_positions.len() != old_len {
         return;
     }
 
@@ -209,30 +215,32 @@ fn collect_unchanged_ranges(
     let common_leading_len = iter::zip(left.ranges, right.ranges)
         .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
         .count();
-    let (left_leading_ranges, left_ranges) = left.ranges.split_at(common_leading_len);
-    let (right_leading_ranges, right_ranges) = right.ranges.split_at(common_leading_len);
+    let left_ranges = &left.ranges[common_leading_len..];
+    let right_ranges = &right.ranges[common_leading_len..];
 
     // Trim trailing common ranges (i.e. grow next unchanged region)
     let common_trailing_len = iter::zip(left_ranges.iter().rev(), right_ranges.iter().rev())
         .take_while(|&(l, r)| left.text[l.clone()] == right.text[r.clone()])
         .count();
-    let left_trailing_ranges = &left_ranges[(left_ranges.len() - common_trailing_len)..];
-    let right_trailing_ranges = &right_ranges[(right_ranges.len() - common_trailing_len)..];
-
-    found_ranges.extend(itertools::chain(
-        iter::zip(
-            left_leading_ranges.iter().cloned(),
-            right_leading_ranges.iter().cloned(),
-        ),
-        iter::zip(
-            left_trailing_ranges.iter().cloned(),
-            right_trailing_ranges.iter().cloned(),
-        ),
+
+    found_positions.extend(itertools::chain(
+        (0..common_leading_len).map(|i| {
+            (
+                left.map_to_global(WordPosition(i)),
+                right.map_to_global(WordPosition(i)),
+            )
+        }),
+        (1..=common_trailing_len).rev().map(|i| {
+            (
+                left.map_to_global(WordPosition(left.ranges.len() - i)),
+                right.map_to_global(WordPosition(right.ranges.len() - i)),
+            )
+        }),
     ));
 }
 
-fn collect_unchanged_ranges_lcs(
-    found_ranges: &mut Vec<(Range<usize>, Range<usize>)>,
+fn collect_unchanged_words_lcs(
+    found_positions: &mut Vec<(WordPosition, WordPosition)>,
     left: &DiffSource,
     right: &DiffSource,
 ) {
@@ -294,30 +302,52 @@ fn collect_unchanged_ranges_lcs(
 
     let lcs = find_lcs(&left_index_by_right_index);
 
-    // Produce output ranges, recursing into the modified areas between the elements
-    // in the LCS.
+    // Produce output word positions, recursing into the modified areas between
+    // the elements in the LCS.
     let mut previous_left_position = WordPosition(0);
     let mut previous_right_position = WordPosition(0);
     for (left_index, right_index) in lcs {
         let left_position = left_positions[left_index].0;
         let right_position = right_positions[right_index].0;
-        collect_unchanged_ranges(
-            found_ranges,
+        collect_unchanged_words(
+            found_positions,
             &left.narrowed(previous_left_position..left_position),
             &right.narrowed(previous_right_position..right_position),
         );
-        found_ranges.push((left.range_at(left_position), right.range_at(right_position)));
+        found_positions.push((
+            left.map_to_global(left_position),
+            right.map_to_global(right_position),
+        ));
         previous_left_position = WordPosition(left_position.0 + 1);
         previous_right_position = WordPosition(right_position.0 + 1);
     }
     // Also recurse into range at end (after common ranges).
-    collect_unchanged_ranges(
-        found_ranges,
+    collect_unchanged_words(
+        found_positions,
         &left.narrowed(previous_left_position..WordPosition(left.ranges.len())),
         &right.narrowed(previous_right_position..WordPosition(right.ranges.len())),
     );
 }
 
+/// Intersects two sorted sequences of `(base, other)` word positions by
+/// `base`. `base` positions should refer to the same source text.
+fn intersect_unchanged_words(
+    current_positions: Vec<(WordPosition, Vec<WordPosition>)>,
+    new_positions: &[(WordPosition, WordPosition)],
+) -> Vec<(WordPosition, Vec<WordPosition>)> {
+    itertools::merge_join_by(
+        current_positions,
+        new_positions,
+        |(cur_base_pos, _), (new_base_pos, _)| cur_base_pos.cmp(new_base_pos),
+    )
+    .filter_map(|entry| entry.both())
+    .map(|((base_pos, mut other_positions), &(_, new_other_pos))| {
+        other_positions.push(new_other_pos);
+        (base_pos, other_positions)
+    })
+    .collect()
+}
+
 #[derive(Clone, PartialEq, Eq, Debug)]
 struct UnchangedRange {
     base_range: Range<usize>,
@@ -325,6 +355,28 @@ struct UnchangedRange {
 }
 
 impl UnchangedRange {
+    /// Translates word positions to byte ranges in the source texts.
+    fn from_word_positions(
+        base_source: &DiffSource,
+        other_sources: &[DiffSource],
+        base_position: WordPosition,
+        other_positions: &[WordPosition],
+    ) -> Self {
+        assert_eq!(other_sources.len(), other_positions.len());
+        let base_range = base_source.range_at(base_position);
+        let offsets = iter::zip(other_sources, other_positions)
+            .map(|(source, pos)| {
+                let other_range = source.range_at(*pos);
+                assert_eq!(base_range.len(), other_range.len());
+                other_range.start.wrapping_sub(base_range.start) as isize
+            })
+            .collect();
+        UnchangedRange {
+            base_range,
+            offsets,
+        }
+    }
+
     fn start(&self, side: usize) -> usize {
         self.base_range
             .start
@@ -365,51 +417,6 @@ pub struct Diff<'input> {
     unchanged_regions: Vec<UnchangedRange>,
 }
 
-/// Takes the current regions and intersects it with the new unchanged ranges
-/// from a 2-way diff. The result is a map of unchanged regions with one more
-/// offset in the map's values.
-fn intersect_regions(
-    current_ranges: Vec<UnchangedRange>,
-    new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
-) -> Vec<UnchangedRange> {
-    let mut result = vec![];
-    let mut current_ranges_iter = current_ranges.into_iter().peekable();
-    for (new_base_range, other_range) in new_unchanged_ranges.iter() {
-        assert_eq!(new_base_range.len(), other_range.len());
-        while let Some(UnchangedRange {
-            base_range,
-            offsets,
-        }) = current_ranges_iter.peek()
-        {
-            // No need to look further if we're past the new range.
-            if base_range.start >= new_base_range.end {
-                break;
-            }
-            // Discard any current unchanged regions that don't match between the base and
-            // the new input.
-            if base_range.end <= new_base_range.start {
-                current_ranges_iter.next();
-                continue;
-            }
-            let new_start = max(base_range.start, new_base_range.start);
-            let new_end = min(base_range.end, new_base_range.end);
-            let mut new_offsets = offsets.clone();
-            new_offsets.push(other_range.start.wrapping_sub(new_base_range.start) as isize);
-            result.push(UnchangedRange {
-                base_range: new_start..new_end,
-                offsets: new_offsets,
-            });
-            if base_range.end >= new_base_range.end {
-                // Break without consuming the item; there may be other new ranges that overlap
-                // with it.
-                break;
-            }
-            current_ranges_iter.next();
-        }
-    }
-    result
-}
-
 impl<'input> Diff<'input> {
     pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
         inputs: impl IntoIterator<Item = &'input T>,
@@ -448,21 +455,64 @@ impl<'input> Diff<'input> {
         other_token_ranges: &[Vec<Range<usize>>],
     ) -> Self {
         assert_eq!(other_inputs.len(), other_token_ranges.len());
-        // Look for unchanged regions. Initially consider the whole range of the base
-        // input as unchanged (compared to itself). Then diff each other input
-        // against the base. Intersect the previously found ranges with the
-        // unchanged ranges in the diff.
         let base_source = DiffSource::new(base_input, base_token_ranges);
-        let mut unchanged_regions = vec![UnchangedRange {
-            base_range: 0..base_input.len(),
-            offsets: vec![],
-        }];
-        for (other_input, other_token_ranges) in iter::zip(&other_inputs, other_token_ranges) {
-            let other_source = DiffSource::new(other_input, other_token_ranges);
-            let mut new_ranges = Vec::new();
-            collect_unchanged_ranges(&mut new_ranges, &base_source, &other_source);
-            unchanged_regions = intersect_regions(unchanged_regions, &new_ranges);
-        }
+        let other_sources = iter::zip(&other_inputs, other_token_ranges)
+            .map(|(input, token_ranges)| DiffSource::new(input, token_ranges))
+            .collect_vec();
+        let mut unchanged_regions = match &*other_sources {
+            // Consider the whole range of the base input as unchanged compared
+            // to itself.
+            [] => {
+                let whole_range = UnchangedRange {
+                    base_range: 0..base_source.text.len(),
+                    offsets: vec![],
+                };
+                vec![whole_range]
+            }
+            // Diff each other input against the base. Intersect the previously
+            // found ranges with the ranges in the diff.
+            [first_other_source, tail_other_sources @ ..] => {
+                let mut first_positions = Vec::new();
+                collect_unchanged_words(&mut first_positions, &base_source, first_other_source);
+                if tail_other_sources.is_empty() {
+                    first_positions
+                        .iter()
+                        .map(|&(base_pos, other_pos)| {
+                            UnchangedRange::from_word_positions(
+                                &base_source,
+                                &other_sources,
+                                base_pos,
+                                &[other_pos],
+                            )
+                        })
+                        .collect()
+                } else {
+                    let first_positions = first_positions
+                        .iter()
+                        .map(|&(base_pos, other_pos)| (base_pos, vec![other_pos]))
+                        .collect();
+                    let intersected_positions = tail_other_sources.iter().fold(
+                        first_positions,
+                        |current_positions, other_source| {
+                            let mut new_positions = Vec::new();
+                            collect_unchanged_words(&mut new_positions, &base_source, other_source);
+                            intersect_unchanged_words(current_positions, &new_positions)
+                        },
+                    );
+                    intersected_positions
+                        .iter()
+                        .map(|(base_pos, other_positions)| {
+                            UnchangedRange::from_word_positions(
+                                &base_source,
+                                &other_sources,
+                                *base_pos,
+                                other_positions,
+                            )
+                        })
+                        .collect()
+                }
+            }
+        };
         // Add an empty range at the end to make life easier for hunks().
         let offsets = other_inputs
             .iter()
@@ -790,9 +840,12 @@ mod tests {
         left: &DiffSource,
         right: &DiffSource,
     ) -> Vec<(Range<usize>, Range<usize>)> {
-        let mut found_ranges = Vec::new();
-        collect_unchanged_ranges(&mut found_ranges, left, right);
-        found_ranges
+        let mut positions = Vec::new();
+        collect_unchanged_words(&mut positions, left, right);
+        positions
+            .into_iter()
+            .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
+            .collect()
     }
 
     #[test]
@@ -875,82 +928,52 @@ mod tests {
     }
 
     #[test]
-    fn test_intersect_regions_existing_empty() {
-        let actual = intersect_regions(vec![], &[(20..25, 55..60)]);
-        let expected = vec![];
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn test_intersect_regions_new_ranges_within_existing() {
-        let actual = intersect_regions(
-            vec![UnchangedRange {
-                base_range: 20..70,
-                offsets: vec![3],
-            }],
-            &[(25..30, 35..40), (40..50, 40..50)],
+    fn test_unchanged_ranges_recursion_needed() {
+        // "|" matches first, then "b" matches within the left/right range.
+        assert_eq!(
+            unchanged_ranges(
+                &DiffSource::new(b"a b | b", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"b c d |", &[0..1, 2..3, 4..5, 6..7]),
+            ),
+            vec![(2..3, 0..1), (4..5, 6..7)]
         );
-        let expected = vec![
-            UnchangedRange {
-                base_range: 25..30,
-                offsets: vec![3, 10],
-            },
-            UnchangedRange {
-                base_range: 40..50,
-                offsets: vec![3, 0],
-            },
-        ];
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn test_intersect_regions_partial_overlap() {
-        let actual = intersect_regions(
-            vec![UnchangedRange {
-                base_range: 20..50,
-                offsets: vec![-3],
-            }],
-            &[(15..25, 5..15), (45..60, 55..70)],
+        assert_eq!(
+            unchanged_ranges(
+                &DiffSource::new(b"| b c d", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"b | a b", &[0..1, 2..3, 4..5, 6..7]),
+            ),
+            vec![(0..1, 2..3), (2..3, 6..7)]
         );
-        let expected = vec![
-            UnchangedRange {
-                base_range: 20..25,
-                offsets: vec![-3, -10],
-            },
-            UnchangedRange {
-                base_range: 45..50,
-                offsets: vec![-3, 10],
-            },
-        ];
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn test_intersect_regions_new_range_overlaps_multiple_existing() {
-        let actual = intersect_regions(
-            vec![
-                UnchangedRange {
-                    base_range: 20..50,
-                    offsets: vec![3, -8],
-                },
-                UnchangedRange {
-                    base_range: 70..80,
-                    offsets: vec![7, 1],
-                },
-            ],
-            &[(10..100, 5..95)],
+        // "|" matches first, then the middle range is trimmed.
+        assert_eq!(
+            unchanged_ranges(
+                &DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"| b b |", &[0..1, 2..3, 4..5, 6..7]),
+            ),
+            vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)]
+        );
+        assert_eq!(
+            unchanged_ranges(
+                &DiffSource::new(b"| c c |", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
+            ),
+            vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)]
+        );
+        // "|" matches first, then "a", then "b".
+        assert_eq!(
+            unchanged_ranges(
+                &DiffSource::new(b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]),
+                &DiffSource::new(b"b a b |", &[0..1, 2..3, 4..5, 6..7]),
+            ),
+            vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)]
+        );
+        assert_eq!(
+            unchanged_ranges(
+                &DiffSource::new(b"| b a b", &[0..1, 2..3, 4..5, 6..7]),
+                &DiffSource::new(b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
+            ),
+            vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)]
         );
-        let expected = vec![
-            UnchangedRange {
-                base_range: 20..50,
-                offsets: vec![3, -8, -5],
-            },
-            UnchangedRange {
-                base_range: 70..80,
-                offsets: vec![7, 1, -5],
-            },
-        ];
-        assert_eq!(actual, expected);
     }
 
     #[test]