Skip to content

Commit

Permalink
MRG: use correct denominator in f_unique_to_query (#3138)
Browse files Browse the repository at this point in the history
This PR fixes an issue introduced in #2943 where we introduced a subtly
broken calculation that uses the _current_ size of the query metagenome
as the denominator for the `f_unique_to_query` calculation.

Fixes #3137

This PR also adds some commented-out test code that demonstrates
#3139 /
sourmash-bio/sourmash_plugin_branchwater#322.
That's something I haven't been able to debug, so I'd suggest fixing
that independently - I'd rather fix _a_ problem _now_, rather than
waiting until we can fix _multiple_ problems at some later indeterminate
time :).

## Notes

- [x] do we need to fix same problem in `linear.rs`? or just rename
things per #3137?
- [x] we should add some tests for this
  • Loading branch information
ctb authored May 10, 2024
1 parent 880b488 commit 777fa20
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ pub fn calculate_gather_stats(
// stats for this match vs current (subtracted) query
let f_match = match_size as f64 / match_mh.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * match_size;
let f_unique_to_query = match_size as f64 / query.size() as f64;
let f_unique_to_query = match_size as f64 / orig_query.size() as f64;

// // get ANI values
let ksize = match_mh.ksize() as f64;
Expand Down
119 changes: 118 additions & 1 deletion src/core/src/index/revindex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,6 @@ fn stats_for_cf(db: Arc<DB>, cf_name: &str, deep_check: bool, quick: bool) -> Db

#[cfg(test)]
mod test {

use camino::Utf8PathBuf as PathBuf;
use tempfile::TempDir;

Expand Down Expand Up @@ -590,6 +589,124 @@ mod test {
Ok(())
}

#[test]
fn revindex_load_and_gather_2() -> Result<()> {
let mut basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
basedir.push("../../tests/test-data/gather/");

let against = vec![
"GCF_000006945.2_ASM694v2_genomic.fna.gz.sig",
"GCF_000007545.1_ASM754v1_genomic.fna.gz.sig",
"GCF_000008105.1_ASM810v1_genomic.fna.gz.sig",
"GCF_000008545.1_ASM854v1_genomic.fna.gz.sig",
"GCF_000009085.1_ASM908v1_genomic.fna.gz.sig",
"GCF_000009505.1_ASM950v1_genomic.fna.gz.sig",
"GCF_000009525.1_ASM952v1_genomic.fna.gz.sig",
"GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig",
"GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig",
"GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig",
"GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig",
"GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig",
];
let against: Vec<_> = against
.iter()
.map(|sig| {
let mut filename = basedir.clone();
filename.push(sig);
filename
})
.collect();

// build 'against' sketches into a revindex
let selection = Selection::builder().ksize(21).scaled(10000).build();
let output = TempDir::new()?;

let collection = Collection::from_paths(&against)?.select(&selection)?;
let _index = RevIndex::create(output.path(), collection.try_into()?, false);

let index = RevIndex::open(output.path(), true, None)?;

let mut query = None;
let mut query_filename = basedir.clone();
query_filename.push("combined.sig");
let query_sig = Signature::from_path(query_filename)?
.swap_remove(0)
.select(&selection)?;

if let Some(q) = prepare_query(query_sig, &selection) {
query = Some(q);
}
let query = query.unwrap();

let (counter, query_colors, hash_to_color) = index.prepare_gather_counters(&query);

let matches = index.gather(
counter,
query_colors,
hash_to_color,
0,
&query,
Some(selection),
)?;

// should be 11, based on test_gather_metagenome_num_results @CTB.
// see sourmash#3139 and sourmash_plugin_branchwater#322.
assert_eq!(matches.len(), 6);

fn round5(a: f64) -> f64 {
(a * 1e5).round() / 1e5
}

let match_ = &matches[0];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_003198.1");
assert_eq!(match_.f_match(), 1.0);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.33219645));

let match_ = &matches[1];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_000853.1");
assert_eq!(match_.f_match(), 1.0);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.13096862));

let match_ = &matches[2];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_011978.1");
assert_eq!(match_.f_match(), 0.898936170212766);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.115279));

let match_ = &matches[3];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_002163.1");
assert_eq!(match_.f_match(), 1.0);
assert_eq!(round5(match_.f_unique_to_query()), round5(0.10709413));

let match_ = &matches[4];
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_003197.2");
assert_eq!(round5(match_.f_match()), round5(0.31340206));
assert_eq!(round5(match_.f_unique_to_query()), round5(0.103683));

let match_ = &matches[5];
dbg!(match_);
let names: Vec<&str> = match_.name().split(' ').take(1).collect();
assert_eq!(names[0], "NC_009486.1");

// @CTB this fails: 0.43158 != 0.48421
// assert_eq!(round5(match_.f_match()), round5(0.4842105));

// @CTB this fails: 0.05593 != 0.06276
// assert_eq!(round5(match_.f_unique_to_query()), round5(0.0627557));

// @CTB fails
// assert_eq!(match_.unique_intersect_bp, 820000);

// @CTB fails
// assert_eq!(match_.remaining_bp, 2170000);

Ok(())
}

#[test]
fn revindex_move() -> Result<()> {
let basedir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
Expand Down

0 comments on commit 777fa20

Please sign in to comment.