Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

diff: remove unneeded hash map lookup #4546

Merged
merged 3 commits into from
Sep 27, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 44 additions & 52 deletions lib/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,12 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
}
}

struct Histogram<'a> {
word_to_positions: HashMap<&'a BStr, Vec<WordPosition>>,
count_to_words: BTreeMap<usize, Vec<&'a BStr>>,
struct Histogram<'input> {
word_to_positions: HashMap<&'input BStr, Vec<WordPosition>>,
}

impl Histogram<'_> {
fn calculate<'a>(source: &DiffSource<'a, '_>, max_occurrences: usize) -> Histogram<'a> {
impl<'input> Histogram<'input> {
fn calculate(source: &DiffSource<'input, '_>, max_occurrences: usize) -> Self {
let mut word_to_positions: HashMap<&BStr, Vec<WordPosition>> = HashMap::new();
for (i, range) in source.ranges.iter().enumerate() {
let word = &source.text[range.clone()];
Expand All @@ -120,14 +119,16 @@ impl Histogram<'_> {
positions.push(WordPosition(i));
}
}
let mut count_to_words: BTreeMap<usize, Vec<&BStr>> = BTreeMap::new();
for (word, ranges) in &word_to_positions {
count_to_words.entry(ranges.len()).or_default().push(word);
}
Histogram {
word_to_positions,
count_to_words,
Histogram { word_to_positions }
}

fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<(&'input BStr, &Vec<WordPosition>)>> {
let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
for (word, positions) in &self.word_to_positions {
let entries = count_to_entries.entry(positions.len()).or_default();
entries.push((*word, positions));
}
count_to_entries
}
}

Expand Down Expand Up @@ -233,59 +234,50 @@ fn unchanged_ranges_lcs(
) -> Vec<(Range<usize>, Range<usize>)> {
let max_occurrences = 100;
let left_histogram = Histogram::calculate(left, max_occurrences);
if *left_histogram.count_to_words.keys().next().unwrap() > max_occurrences {
let left_count_to_entries = left_histogram.build_count_to_entries();
if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
// If there are very many occurrences of all words, then we just give up.
return vec![];
}
let right_histogram = Histogram::calculate(right, max_occurrences);
// Look for words with few occurrences in `left` (could equally well have picked
// `right`?). If any of them also occur in `right`, then we add the words to
// the LCS.
let Some(uncommon_shared_words) = left_histogram
.count_to_words
.iter()
.map(|(left_count, left_words)| -> Vec<&BStr> {
left_words
let Some(uncommon_shared_word_positions) =
left_count_to_entries.values().find_map(|left_entries| {
let mut both_positions = left_entries
.iter()
.copied()
.filter(|left_word| {
let right_count = right_histogram
.word_to_positions
.get(left_word)
.map_or(0, |right_positions| right_positions.len());
*left_count == right_count
.filter_map(|&(word, left_positions)| {
let right_positions = right_histogram.word_to_positions.get(word)?;
(left_positions.len() == right_positions.len())
.then_some((left_positions, right_positions))
})
.collect()
.peekable();
both_positions.peek().is_some().then_some(both_positions)
})
.find(|words| !words.is_empty())
else {
return vec![];
};

// [(index into left_ranges, word, occurrence #)]
let mut left_positions = vec![];
let mut right_positions = vec![];
for uncommon_shared_word in uncommon_shared_words {
let left_occurrences = &left_histogram.word_to_positions[uncommon_shared_word];
let right_occurrences = &right_histogram.word_to_positions[uncommon_shared_word];
assert_eq!(left_occurrences.len(), right_occurrences.len());
for (occurrence, (&left_pos, &right_pos)) in
iter::zip(left_occurrences, right_occurrences).enumerate()
{
left_positions.push((left_pos, uncommon_shared_word, occurrence));
right_positions.push((right_pos, uncommon_shared_word, occurrence));
// [(index into ranges, serial to identify {word, occurrence #})]
let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
uncommon_shared_word_positions
.flat_map(|(lefts, rights)| iter::zip(lefts, rights))
.enumerate()
.map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
.unzip();
left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
let left_index_by_right_index: Vec<usize> = {
let mut left_index_map = vec![0; left_positions.len()];
for (i, &(_pos, serial)) in left_positions.iter().enumerate() {
left_index_map[serial] = i;
}
}
left_positions.sort_unstable_by_key(|(pos, _word, _occurence)| *pos);
right_positions.sort_unstable_by_key(|(pos, _word, _occurence)| *pos);
let mut left_position_map = HashMap::new();
for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() {
left_position_map.insert((*word, *occurrence), i);
}
let mut left_index_by_right_index = vec![];
for (_pos, word, occurrence) in &right_positions {
left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
}
right_positions
.iter()
.map(|&(_pos, serial)| left_index_map[serial])
.collect()
};

let lcs = find_lcs(&left_index_by_right_index);

Expand All @@ -295,8 +287,8 @@ fn unchanged_ranges_lcs(
let mut previous_left_position = WordPosition(0);
let mut previous_right_position = WordPosition(0);
for (left_index, right_index) in lcs {
let left_position = left_positions[left_index].0;
let right_position = right_positions[right_index].0;
let (left_position, _) = left_positions[left_index];
let (right_position, _) = right_positions[right_index];
let skipped_left_positions = previous_left_position..left_position;
let skipped_right_positions = previous_right_position..right_position;
if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
Expand Down