From 57c0d4f263e5b46f15583a90a04518c718f739de Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 20 Apr 2021 17:34:44 -0700 Subject: [PATCH 1/2] add minhash intersection benchmarks mergeless intersection/union, avoid dual iterating --- src/core/Cargo.toml | 4 + src/core/benches/minhash.rs | 54 +++++++++ src/core/src/sketch/minhash.rs | 210 +++++++++++++++++++++++++++------ 3 files changed, 232 insertions(+), 36 deletions(-) create mode 100644 src/core/benches/minhash.rs diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 4309017b89..3de8c37734 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -78,3 +78,7 @@ harness = false [[bench]] name = "nodegraph" harness = false + +[[bench]] +name = "minhash" +harness = false diff --git a/src/core/benches/minhash.rs b/src/core/benches/minhash.rs new file mode 100644 index 0000000000..a1c0f9ea3f --- /dev/null +++ b/src/core/benches/minhash.rs @@ -0,0 +1,54 @@ +#[macro_use] +extern crate criterion; + +use std::fs::File; +use std::io::{BufReader, Cursor, Read}; +use std::path::PathBuf; + +use sourmash::signature::Signature; +use sourmash::sketch::minhash::KmerMinHash; +use sourmash::sketch::Sketch; + +use criterion::Criterion; + +fn intersection(c: &mut Criterion) { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/gather-abund/genome-s10.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let mut sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + let mh = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] { + mh.clone() + } else { + unimplemented!() + }; + + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/gather-abund/genome-s11.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let mut sigs: Vec = serde_json::from_reader(reader).expect("Loading error"); + let mh2 = if let Sketch::MinHash(mh) = &sigs.swap_remove(0).sketches()[0] { + mh.clone() + } else { + unimplemented!() + }; + + let mut group = c.benchmark_group("minhash"); + group.sample_size(10); + + group.bench_function("intersection", |b| { + b.iter(|| { + mh.intersection(&mh2).unwrap(); + }); + }); + + group.bench_function("intersection_size", |b| { + b.iter(|| { + mh.intersection_size(&mh2).unwrap(); + }); + }); +} + +criterion_group!(minhash, intersection); +criterion_main!(minhash); diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 5dde2af805..da40c60bbd 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -579,27 +579,72 @@ impl KmerMinHash { pub fn intersection(&self, other: &KmerMinHash) -> Result<(Vec, u64), Error> { self.check_compatible(other)?; - let mut combined_mh = KmerMinHash::new( - self.scaled(), - self.ksize, - self.hash_function, - self.seed, - self.abunds.is_some(), - self.num, - ); - - combined_mh.merge(&self)?; - combined_mh.merge(&other)?; - - let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); - - // TODO: there is probably a way to avoid this Vec here, - // and pass the it1 as left in it2. - let i1: Vec = it1.cloned().collect(); - let it2 = Intersection::new(i1.iter(), combined_mh.mins.iter()); + if self.num != 0 { + // Intersection for regular MinHash sketches + let mut combined_mh = KmerMinHash::new( + self.scaled(), + self.ksize, + self.hash_function, + self.seed, + self.abunds.is_some(), + self.num, + ); + + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; + + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); + + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. + let i1: Vec = it1.cloned().collect(); + let it2 = Intersection::new(i1.iter(), combined_mh.mins.iter()); + + let common: Vec = it2.cloned().collect(); + Ok((common, combined_mh.mins.len() as u64)) + } else { + // Intersection for scaled MinHash sketches + + let mut me = self.mins.iter().peekable(); + let mut other = other.mins.iter().peekable(); + let mut common: Vec = vec![]; + let mut union_size = 0; + + loop { + match (me.peek(), other.peek()) { + (Some(ref left_key), Some(ref right_key)) => { + let res = left_key.cmp(right_key); + match res { + Ordering::Less => { + me.next(); + union_size += 1; + } + Ordering::Greater => { + other.next(); + union_size += 1; + } + Ordering::Equal => { + other.next(); + common.push(***left_key); + me.next(); + union_size += 1; + } + }; + } + (None, Some(_)) => { + other.next(); + union_size += 1; + } + (Some(_), None) => { + me.next(); + union_size += 1; + } + _ => break, + }; + } - let common: Vec = it2.cloned().collect(); - Ok((common, combined_mh.mins.len() as u64)) + Ok((common, union_size as u64)) + } } // FIXME: intersection_size and count_common should be the same? @@ -607,26 +652,69 @@ impl KmerMinHash { pub fn intersection_size(&self, other: &KmerMinHash) -> Result<(u64, u64), Error> { self.check_compatible(other)?; - let mut combined_mh = KmerMinHash::new( - self.scaled(), - self.ksize, - self.hash_function, - self.seed, - self.abunds.is_some(), - self.num, - ); + if self.num != 0 { + // Intersection for regular MinHash sketches + let mut combined_mh = KmerMinHash::new( + self.scaled(), + self.ksize, + self.hash_function, + self.seed, + self.abunds.is_some(), + self.num, + ); - combined_mh.merge(&self)?; - combined_mh.merge(&other)?; + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; - let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); - // TODO: there is probably a way to avoid this Vec here, - // and pass the it1 as left in it2. - let i1: Vec = it1.cloned().collect(); - let it2 = Intersection::new(i1.iter(), combined_mh.mins.iter()); + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. + let i1: Vec = it1.cloned().collect(); + let it2 = Intersection::new(i1.iter(), combined_mh.mins.iter()); - Ok((it2.count() as u64, combined_mh.mins.len() as u64)) + Ok((it2.count() as u64, combined_mh.mins.len() as u64)) + } else { + // Intersection for scaled MinHash sketches + let mut me = self.mins.iter().peekable(); + let mut other = other.mins.iter().peekable(); + let mut common = 0; + let mut union_size = 0; + + loop { + match (me.peek(), other.peek()) { + (Some(ref left_key), Some(ref right_key)) => { + let res = left_key.cmp(right_key); + match res { + Ordering::Less => { + me.next(); + union_size += 1; + } + Ordering::Greater => { + other.next(); + union_size += 1; + } + Ordering::Equal => { + other.next(); + me.next(); + common += 1; + union_size += 1; + } + }; + } + (None, Some(_)) => { + other.next(); + union_size += 1; + } + (Some(_), None) => { + me.next(); + union_size += 1; + } + _ => break, + }; + } + Ok((common as u64, union_size as u64)) + } } // calculate Jaccard similarity, ignoring abundance. @@ -869,6 +957,56 @@ impl> Iterator for Intersection { } } +struct Union> { + iter: Peekable, + other: Peekable, +} + +impl> Iterator for Union { + type Item = T; + + fn next(&mut self) -> Option { + let res = match (self.iter.peek(), self.other.peek()) { + (Some(ref left_key), Some(ref right_key)) => left_key.cmp(right_key), + (None, Some(_)) => { + return self.other.next(); + } + (Some(_), None) => { + return self.iter.next(); + } + _ => return None, + }; + + match res { + Ordering::Less => self.iter.next(), + Ordering::Greater => self.other.next(), + Ordering::Equal => { + self.other.next(); + self.iter.next() + } + } + } +} + +#[cfg(test)] +mod test { + use super::Union; + + #[test] + fn test_union() { + let v1 = [1u64, 2, 4, 10]; + let v2 = [1u64, 3, 4, 9]; + + let union: Vec = Union { + iter: v1.iter().peekable(), + other: v2.iter().peekable(), + } + .cloned() + .collect(); + assert_eq!(union, [1, 2, 3, 4, 9, 10]); + } +} + //############# // A MinHash implementation for low scaled or large cardinalities From d485860440014e32b69950d13452e754cf6797d4 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 7 May 2021 12:10:56 -0700 Subject: [PATCH 2/2] large intersection benchmarks --- src/core/benches/minhash.rs | 63 ++++++++- src/core/src/sketch/minhash.rs | 249 ++++++++++++++++++--------------- 2 files changed, 194 insertions(+), 118 deletions(-) diff --git a/src/core/benches/minhash.rs b/src/core/benches/minhash.rs index a1c0f9ea3f..c2e27b0249 100644 --- a/src/core/benches/minhash.rs +++ b/src/core/benches/minhash.rs @@ -2,11 +2,11 @@ extern crate criterion; use std::fs::File; -use std::io::{BufReader, Cursor, Read}; +use std::io::BufReader; use std::path::PathBuf; -use sourmash::signature::Signature; -use sourmash::sketch::minhash::KmerMinHash; +use sourmash::signature::{Signature, SigsTrait}; +use sourmash::sketch::minhash::{KmerMinHash, KmerMinHashBTree}; use sourmash::sketch::Sketch; use criterion::Criterion; @@ -48,6 +48,63 @@ fn intersection(c: &mut Criterion) { mh.intersection_size(&mh2).unwrap(); }); }); + + let mut mh1 = KmerMinHash::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + let mut mh2 = KmerMinHash::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + + let mut mh1_btree = KmerMinHashBTree::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + let mut mh2_btree = KmerMinHashBTree::builder() + .num(0) + .max_hash(1_000_000) + .ksize(21) + .build(); + + for i in 0..=1_000_000 { + if i % 2 == 0 { + mh1.add_hash(i); + mh1_btree.add_hash(i); + } + if i % 45 == 0 { + mh2.add_hash(i); + mh2_btree.add_hash(i); + } + } + + group.bench_function("large intersection", |b| { + b.iter(|| { + mh1.intersection(&mh2).unwrap(); + }); + }); + + group.bench_function("large intersection_size", |b| { + b.iter(|| { + mh1.intersection_size(&mh2).unwrap(); + }); + }); + + group.bench_function("large intersection btree", |b| { + b.iter(|| { + mh1_btree.intersection(&mh2_btree).unwrap(); + }); + }); + + group.bench_function("large intersection_size btree", |b| { + b.iter(|| { + mh1_btree.intersection_size(&mh2_btree).unwrap(); + }); + }); } criterion_group!(minhash, intersection); diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index da40c60bbd..a9d8a9d778 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -603,47 +603,7 @@ impl KmerMinHash { let common: Vec = it2.cloned().collect(); Ok((common, combined_mh.mins.len() as u64)) } else { - // Intersection for scaled MinHash sketches - - let mut me = self.mins.iter().peekable(); - let mut other = other.mins.iter().peekable(); - let mut common: Vec = vec![]; - let mut union_size = 0; - - loop { - match (me.peek(), other.peek()) { - (Some(ref left_key), Some(ref right_key)) => { - let res = left_key.cmp(right_key); - match res { - Ordering::Less => { - me.next(); - union_size += 1; - } - Ordering::Greater => { - other.next(); - union_size += 1; - } - Ordering::Equal => { - other.next(); - common.push(***left_key); - me.next(); - union_size += 1; - } - }; - } - (None, Some(_)) => { - other.next(); - union_size += 1; - } - (Some(_), None) => { - me.next(); - union_size += 1; - } - _ => break, - }; - } - - Ok((common, union_size as u64)) + Ok(intersection(self.mins.iter(), other.mins.iter())) } } @@ -675,45 +635,7 @@ impl KmerMinHash { Ok((it2.count() as u64, combined_mh.mins.len() as u64)) } else { - // Intersection for scaled MinHash sketches - let mut me = self.mins.iter().peekable(); - let mut other = other.mins.iter().peekable(); - let mut common = 0; - let mut union_size = 0; - - loop { - match (me.peek(), other.peek()) { - (Some(ref left_key), Some(ref right_key)) => { - let res = left_key.cmp(right_key); - match res { - Ordering::Less => { - me.next(); - union_size += 1; - } - Ordering::Greater => { - other.next(); - union_size += 1; - } - Ordering::Equal => { - other.next(); - me.next(); - common += 1; - union_size += 1; - } - }; - } - (None, Some(_)) => { - other.next(); - union_size += 1; - } - (Some(_), None) => { - me.next(); - union_size += 1; - } - _ => break, - }; - } - Ok((common as u64, union_size as u64)) + Ok(intersection_size(self.mins.iter(), other.mins.iter())) } } @@ -1439,54 +1361,63 @@ impl KmerMinHashBTree { pub fn intersection(&self, other: &KmerMinHashBTree) -> Result<(Vec, u64), Error> { self.check_compatible(other)?; - let mut combined_mh = KmerMinHashBTree::new( - self.scaled(), - self.ksize, - self.hash_function, - self.seed, - self.abunds.is_some(), - self.num, - ); + if self.num != 0 { + let mut combined_mh = KmerMinHashBTree::new( + self.scaled(), + self.ksize, + self.hash_function, + self.seed, + self.abunds.is_some(), + self.num, + ); - combined_mh.merge(&self)?; - combined_mh.merge(&other)?; + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; - let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); - // TODO: there is probably a way to avoid this Vec here, - // and pass the it1 as left in it2. - let i1: Vec = it1.cloned().collect(); - let i2: Vec = combined_mh.mins.iter().cloned().collect(); - let it2 = Intersection::new(i1.iter(), i2.iter()); + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. + let i1: Vec = it1.cloned().collect(); + let i2: Vec = combined_mh.mins.iter().cloned().collect(); + let it2 = Intersection::new(i1.iter(), i2.iter()); - let common: Vec = it2.cloned().collect(); - Ok((common, combined_mh.mins.len() as u64)) + let common: Vec = it2.cloned().collect(); + Ok((common, combined_mh.mins.len() as u64)) + } else { + // Intersection for scaled MinHash sketches + Ok(intersection(self.mins.iter(), other.mins.iter())) + } } pub fn intersection_size(&self, other: &KmerMinHashBTree) -> Result<(u64, u64), Error> { self.check_compatible(other)?; - let mut combined_mh = KmerMinHashBTree::new( - self.scaled(), - self.ksize, - self.hash_function, - self.seed, - self.abunds.is_some(), - self.num, - ); + if self.num != 0 { + let mut combined_mh = KmerMinHashBTree::new( + self.scaled(), + self.ksize, + self.hash_function, + self.seed, + self.abunds.is_some(), + self.num, + ); - combined_mh.merge(&self)?; - combined_mh.merge(&other)?; + combined_mh.merge(&self)?; + combined_mh.merge(&other)?; - let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); + let it1 = Intersection::new(self.mins.iter(), other.mins.iter()); - // TODO: there is probably a way to avoid this Vec here, - // and pass the it1 as left in it2. - let i1: Vec = it1.cloned().collect(); - let i2: Vec = combined_mh.mins.iter().cloned().collect(); - let it2 = Intersection::new(i1.iter(), i2.iter()); + // TODO: there is probably a way to avoid this Vec here, + // and pass the it1 as left in it2. + let i1: Vec = it1.cloned().collect(); + let i2: Vec = combined_mh.mins.iter().cloned().collect(); + let it2 = Intersection::new(i1.iter(), i2.iter()); - Ok((it2.count() as u64, combined_mh.mins.len() as u64)) + Ok((it2.count() as u64, combined_mh.mins.len() as u64)) + } else { + Ok(intersection_size(self.mins.iter(), other.mins.iter())) + } } // calculate Jaccard similarity, ignoring abundance. @@ -1710,3 +1641,91 @@ impl From for KmerMinHashBTree { new_mh } } + +fn intersection<'a>( + me_iter: impl Iterator, + other_iter: impl Iterator, +) -> (Vec, u64) { + let mut me = me_iter.peekable(); + let mut other = other_iter.peekable(); + let mut common: Vec = vec![]; + let mut union_size = 0; + + loop { + match (me.peek(), other.peek()) { + (Some(ref left_key), Some(ref right_key)) => { + let res = left_key.cmp(right_key); + match res { + Ordering::Less => { + me.next(); + union_size += 1; + } + Ordering::Greater => { + other.next(); + union_size += 1; + } + Ordering::Equal => { + other.next(); + common.push(***left_key); + me.next(); + union_size += 1; + } + }; + } + (None, Some(_)) => { + other.next(); + union_size += 1; + } + (Some(_), None) => { + me.next(); + union_size += 1; + } + _ => break, + }; + } + (common, union_size as u64) +} + +fn intersection_size<'a>( + me_iter: impl Iterator, + other_iter: impl Iterator, +) -> (u64, u64) { + let mut me = me_iter.peekable(); + let mut other = other_iter.peekable(); + let mut common = 0; + let mut union_size = 0; + + loop { + match (me.peek(), other.peek()) { + (Some(ref left_key), Some(ref right_key)) => { + let res = left_key.cmp(right_key); + match res { + Ordering::Less => { + me.next(); + union_size += 1; + } + Ordering::Greater => { + other.next(); + union_size += 1; + } + Ordering::Equal => { + other.next(); + me.next(); + common += 1; + union_size += 1; + } + }; + } + (None, Some(_)) => { + other.next(); + union_size += 1; + } + (Some(_), None) => { + me.next(); + union_size += 1; + } + _ => break, + }; + } + (common as u64, union_size as u64) +}