jj-vcs · yuja · Sep 27, 2024 · Sep 26, 2024 · Sep 26, 2024 · Sep 27, 2024
diff --git a/lib/src/diff.rs b/lib/src/diff.rs
@@ -103,13 +103,12 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
     }
 }
 
-struct Histogram<'a> {
-    word_to_positions: HashMap<&'a BStr, Vec<WordPosition>>,
-    count_to_words: BTreeMap<usize, Vec<&'a BStr>>,
+struct Histogram<'input> {
+    word_to_positions: HashMap<&'input BStr, Vec<WordPosition>>,
 }
 
-impl Histogram<'_> {
-    fn calculate<'a>(source: &DiffSource<'a, '_>, max_occurrences: usize) -> Histogram<'a> {
+impl<'input> Histogram<'input> {
+    fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self {
         let mut word_to_positions: HashMap<&BStr, Vec<WordPosition>> = HashMap::new();
         for (i, range) in source.ranges.iter().enumerate() {
             let word = &source.text[range.clone()];
@@ -120,14 +119,16 @@ impl Histogram<'_> {
                 positions.push(WordPosition(i));
             }
         }
-        let mut count_to_words: BTreeMap<usize, Vec<&BStr>> = BTreeMap::new();
-        for (word, ranges) in &word_to_positions {
-            count_to_words.entry(ranges.len()).or_default().push(word);
-        }
-        Histogram {
-            word_to_positions,
-            count_to_words,
+        Histogram { word_to_positions }
+    }
+
+    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
+        let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
+        for (word, positions) in &self.word_to_positions {
+            let entries = count_to_entries.entry(positions.len()).or_default();
+            entries.push((*word, positions));
         }
+        count_to_entries
     }
 }
 
@@ -233,59 +234,50 @@ fn unchanged_ranges_lcs(
 ) -> Vec<(Range<usize>, Range<usize>)> {
     let max_occurrences = 100;
     let left_histogram = Histogram::calculate(left, max_occurrences);
-    if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences {
+    let left_count_to_entries = left_histogram.build_count_to_entries();
+    if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
         // If there are very many occurrences of all words, then we just give up.
         return vec![];
     }
     let right_histogram = Histogram::calculate(right, max_occurrences);
     // Look for words with few occurrences in `left` (could equally well have picked
     // `right`?). If any of them also occur in `right`, then we add the words to
     // the LCS.
-    let Some(uncommon_shared_words) = left_histogram
-        .count_to_words
-        .iter()
-        .map(|(left_count, left_words)| -> Vec<&BStr> {
-            left_words
+    let Some(uncommon_shared_word_positions) =
+        left_count_to_entries.values().find_map(|left_entries| {
+            let mut both_positions = left_entries
                 .iter()
-                .copied()
-                .filter(|left_word| {
-                    let right_count = right_histogram
-                        .word_to_positions
-                        .get(left_word)
-                        .map_or(0, |right_positions| right_positions.len());
-                    *left_count == right_count
+                .filter_map(|&(word, left_positions)| {
+                    let right_positions = right_histogram.word_to_positions.get(word)?;
+                    (left_positions.len() == right_positions.len())
+                        .then_some((left_positions, right_positions))
                 })
-                .collect()
+                .peekable();
+            both_positions.peek().is_some().then_some(both_positions)
         })
-        .find(|words| !words.is_empty())
     else {
         return vec![];
     };
 
-    // [(index into left_ranges, word, occurrence #)]
-    let mut left_positions = vec![];
-    let mut right_positions = vec![];
-    for uncommon_shared_word in uncommon_shared_words {
-        let left_occurrences = &left_histogram.word_to_positions[uncommon_shared_word];
-        let right_occurrences = &right_histogram.word_to_positions[uncommon_shared_word];
-        assert_eq!(left_occurrences.len(), right_occurrences.len());
-        for (occurrence, (&left_pos, &right_pos)) in
-            iter::zip(left_occurrences, right_occurrences).enumerate()
-        {
-            left_positions.push((left_pos, uncommon_shared_word, occurrence));
-            right_positions.push((right_pos, uncommon_shared_word, occurrence));
+    // [(index into ranges, serial to identify {word, occurrence #})]
+    let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
+        uncommon_shared_word_positions
+            .flat_map(|(lefts, rights)| iter::zip(lefts, rights))
+            .enumerate()
+            .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
+            .unzip();
+    left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
+    right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
+    let left_index_by_right_index: Vec<usize> = {
+        let mut left_index_map = vec![0; left_positions.len()];
+        for (i, &(_pos, serial)) in left_positions.iter().enumerate() {
+            left_index_map[serial] = i;
         }
-    }
-    left_positions.sort_unstable_by_key(|(pos, _word, _occurence)| *pos);
-    right_positions.sort_unstable_by_key(|(pos, _word, _occurence)| *pos);
-    let mut left_position_map = HashMap::new();
-    for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() {
-        left_position_map.insert((*word, *occurrence), i);
-    }
-    let mut left_index_by_right_index = vec![];
-    for (_pos, word, occurrence) in &right_positions {
-        left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
-    }
+        right_positions
+            .iter()
+            .map(|&(_pos, serial)| left_index_map[serial])
+            .collect()
+    };
 
     let lcs = find_lcs(&left_index_by_right_index);
 
@@ -295,8 +287,8 @@ fn unchanged_ranges_lcs(
     let mut previous_left_position = WordPosition(0);
     let mut previous_right_position = WordPosition(0);
     for (left_index, right_index) in lcs {
-        let left_position = left_positions[left_index].0;
-        let right_position = right_positions[right_index].0;
+        let (left_position, _) = left_positions[left_index];
+        let (right_position, _) = right_positions[right_index];
         let skipped_left_positions = previous_left_position..left_position;
         let skipped_right_positions = previous_right_position..right_position;
         if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {