Skip to content

Commit

Permalink
Merge branch 'latest' of https://github.com/dib-lab/sourmash into add…
Browse files Browse the repository at this point in the history
…/merge_name
  • Loading branch information
hehouts committed Apr 29, 2021
2 parents fd1e6f9 + 8fbf83f commit fcdfce2
Show file tree
Hide file tree
Showing 11 changed files with 202 additions and 38 deletions.
14 changes: 14 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Dependabot configuration: daily update checks for the pip and cargo
# dependency manifests located at the repository root ("/").
version: 2
updates:
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: daily
      time: "13:00"
    # upper bound on simultaneously open update pull requests
    open-pull-requests-limit: 10
  - package-ecosystem: cargo
    directory: "/"
    schedule:
      interval: daily
      time: "13:00"
    # upper bound on simultaneously open update pull requests
    open-pull-requests-limit: 10
4 changes: 2 additions & 2 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ def prepare_query(query_mh, subj_mh):
if search_fn.passes(score):
# note: here we yield the original signature, not the
# downsampled minhash.
search_fn.collect(score)
yield subj, score
if search_fn.collect(score, subj):
yield subj, score

def search_abund(self, query, *, threshold=None, **kwargs):
"""Return set of matches with angular similarity above 'threshold'.
Expand Down
9 changes: 7 additions & 2 deletions src/sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,9 +462,14 @@ def find(self, search_fn, query, **kwargs):

score = search_fn.score_fn(query_size, shared_size, subj_size,
total_size)

# note to self: even with JaccardSearchBestOnly, this will
# still iterate over & score all signatures. We should come
# up with a protocol by which the JaccardSearch object can
# signal that it is done, or something.
if search_fn.passes(score):
search_fn.collect(score)
yield subj, score
if search_fn.collect(score, subj):
yield subj, score

@cached_property
def lid_to_idx(self):
Expand Down
9 changes: 6 additions & 3 deletions src/sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,9 +436,12 @@ def node_search(node, *args, **kwargs):

if search_fn.passes(score):
if is_leaf: # terminal node? keep.
results[node.data] = score
search_fn.collect(score)
return True
if search_fn.collect(score, node.data):
results[node.data] = score
return True
else: # it's a good internal node, keep.
return True

return False

# & execute!
Expand Down
20 changes: 14 additions & 6 deletions src/sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def make_gather_query(query_mh, threshold_bp):
if threshold > 1.0:
return None

search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT, threshold=threshold)
search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT,
threshold=threshold)

return search_obj

Expand Down Expand Up @@ -111,14 +112,20 @@ def check_is_compatible(self, sig):
raise TypeError("this search cannot be done with an abund signature")

def passes(self, score):
    """Return True if this score meets or exceeds the threshold.

    Note: this can be used whenever a score or estimate is available
    (e.g. internal nodes on an SBT). `collect(...)`, below, decides
    whether a particular signature should be collected, and/or can
    update the threshold (used for BestOnly behavior).
    """
    # a falsy score (None or 0) never passes, whatever the threshold is.
    return bool(score and score >= self.threshold)

def collect(self, score, match_sig):
    """Return True if this match should be collected.

    This base implementation accepts every match it is given and does
    not touch the threshold; subclasses override it to reject specific
    matches or to adjust the threshold.
    """
    return True

def score_jaccard(self, query_size, shared_size, subject_size, total_size):
"Calculate Jaccard similarity."
Expand All @@ -142,9 +149,10 @@ def score_max_containment(self, query_size, shared_size, subject_size,

class JaccardSearchBestOnly(JaccardSearch):
    "A subclass of JaccardSearch that implements best-only."

    def collect(self, score, match):
        """Raise the threshold to the best match found so far.

        Always returns True, so the match handed in is still collected;
        the raised threshold only affects subsequent `passes()` checks.
        """
        # max() guarantees the threshold is never lowered here.
        self.threshold = max(self.threshold, score)
        return True


# generic SearchResult tuple.
Expand Down
16 changes: 13 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
import os

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})

from hypothesis import settings, Verbosity
import pytest

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})

from sourmash_tst_utils import TempDirectory, RunnerContext


@pytest.fixture
def runtmp():
    """Yield a RunnerContext built on a TempDirectory location.

    The TempDirectory context manager is exited once the test using this
    fixture has finished (presumably removing the directory - confirm
    against TempDirectory).
    """
    with TempDirectory() as tmp_location:
        context = RunnerContext(tmp_location)
        yield context


@pytest.fixture
def run():
    "Yield a RunnerContext built on the current working directory."
    cwd = os.getcwd()
    yield RunnerContext(cwd)


@pytest.fixture(params=[True, False])
def track_abundance(request):
Expand Down
7 changes: 2 additions & 5 deletions tests/sourmash_tst_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"Various utilities used by sourmash tests."

import sys
import os
import tempfile
Expand All @@ -12,10 +11,7 @@
from pkg_resources import Requirement, resource_filename, ResolutionError
import traceback
from io import open # pylint: disable=redefined-builtin
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
from io import StringIO


SIG_FILES = [os.path.join('demo', f) for f in (
Expand Down Expand Up @@ -193,6 +189,7 @@ def run_sourmash(self, *args, **kwargs):
raise ValueError(self)

return self.last_result
sourmash = run_sourmash

def run(self, scriptname, *args, **kwargs):
"Run a script with the given arguments."
Expand Down
126 changes: 126 additions & 0 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sourmash.sbt import SBT, GraphFactory, Leaf
from sourmash.sbtmh import SigLeaf
from sourmash import sourmash_args
from sourmash.search import JaccardSearch, SearchType

import sourmash_tst_utils as utils

Expand Down Expand Up @@ -1081,3 +1082,128 @@ def test_multi_index_load_from_pathlist_3_zipfile(c):

mi = MultiIndex.load_from_pathlist(file_list)
assert len(mi) == 7

##
## Test an unusual JaccardSearch subclass: this exercises the JaccardSearch
## 'collect' protocol in particular.
##

class JaccardSearchBestOnly_ButIgnore(JaccardSearch):
    """A search that refuses certain results, but still does all the pruning.

    Any match whose similarity to an entry of `ignore_list` is exactly 1.0
    is rejected by `collect`; every other match is accepted and its score
    becomes the new threshold.
    """

    def __init__(self, ignore_list):
        super().__init__(SearchType.JACCARD, threshold=0.1)
        self.ignore_list = ignore_list

    def collect(self, score, match):
        "Return False for matches identical to an ignored signature."
        print('in collect; current threshold:', self.threshold)
        for ignored in self.ignore_list:
            print('ZZZ', match, match.similarity(ignored))
            is_exact = match.similarity(ignored) == 1.0
            if is_exact:
                print('yes, found.')
                return False

        # not an ignored signature: adopt its score as the threshold,
        # which could help prune.
        self.threshold = score
        return True


def test_linear_index_gather_ignore():
    "LinearIndex.find() must not yield a match that collect() rejects."
    sig_paths = [utils.get_test_data(name)
                 for name in ('2.fa.sig', '47.fa.sig', '63.fa.sig')]
    ss2, ss47, ss63 = [sourmash.load_one_signature(path, ksize=31)
                       for path in sig_paths]

    # build the index over all three signatures...
    lidx = LinearIndex([ss2, ss47, ss63])

    # ...and search with an object that should ignore sig47, the exact match.
    search_fn = JaccardSearchBestOnly_ButIgnore([ss47])

    results = [ss for (ss, score) in lidx.find(search_fn, ss47)]

    def is_found(target, candidates):
        # True as soon as one candidate has similarity 1.0 with target.
        for candidate in candidates:
            print(target, target.similarity(candidate))
            if target.similarity(candidate) == 1.0:
                return True
        return False

    assert not is_found(ss47, results)
    assert not is_found(ss2, results)
    assert is_found(ss63, results)


def test_lca_index_gather_ignore():
    "LCA_Database.find() must not yield a match that collect() rejects."
    from sourmash.lca import LCA_Database

    sig_paths = [utils.get_test_data(name)
                 for name in ('2.fa.sig', '47.fa.sig', '63.fa.sig')]
    ss2, ss47, ss63 = [sourmash.load_one_signature(path, ksize=31)
                       for path in sig_paths]

    # build the database, inserting in the order 2, 47, 63...
    db = LCA_Database(ksize=31, scaled=1000)
    for sig in (ss2, ss47, ss63):
        db.insert(sig)

    # ...and search with an object that should ignore sig47, the exact match.
    search_fn = JaccardSearchBestOnly_ButIgnore([ss47])

    results = [ss for (ss, score) in db.find(search_fn, ss47)]

    def is_found(target, candidates):
        # True as soon as one candidate has similarity 1.0 with target.
        for candidate in candidates:
            print(target, target.similarity(candidate))
            if target.similarity(candidate) == 1.0:
                return True
        return False

    assert not is_found(ss47, results)
    assert not is_found(ss2, results)
    assert is_found(ss63, results)


def test_sbt_index_gather_ignore():
    "SBT.find() must not yield a match that collect() rejects."
    sig_paths = [utils.get_test_data(name)
                 for name in ('2.fa.sig', '47.fa.sig', '63.fa.sig')]
    ss2, ss47, ss63 = [sourmash.load_one_signature(path, ksize=31)
                       for path in sig_paths]

    # build the tree, inserting in the order 2, 47, 63...
    factory = GraphFactory(5, 100, 3)
    db = SBT(factory, d=2)
    for sig in (ss2, ss47, ss63):
        db.insert(sig)

    # ...and search with an object that should ignore sig47, the exact match.
    print(f'\n** trying to ignore {ss47}')
    search_fn = JaccardSearchBestOnly_ButIgnore([ss47])

    results = [ss for (ss, score) in db.find(search_fn, ss47)]

    def is_found(target, candidates):
        # True as soon as one candidate has similarity 1.0 with target.
        for candidate in candidates:
            print('is found?', target, target.similarity(candidate))
            if target.similarity(candidate) == 1.0:
                return True
        return False

    assert not is_found(ss47, results)
    assert not is_found(ss2, results)
    assert is_found(ss63, results)
4 changes: 2 additions & 2 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,13 @@ def test_score_jaccard_max_containment_zero_query_size():

def test_collect():
    # without best_only, collect() must leave the threshold unchanged.
    search_obj = make_jaccard_search_query(threshold=0)
    search_obj.collect(1.0, None)
    assert search_obj.threshold == 0


def test_collect_best_only():
    # with best_only=True, collect() must raise the threshold to the score.
    search_obj = make_jaccard_search_query(threshold=0, best_only=True)
    search_obj.collect(1.0, None)
    assert search_obj.threshold == 1.0


Expand Down
27 changes: 12 additions & 15 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,15 +859,14 @@ def test_gather_query_db_md5_ambiguous(c):
assert "Error! Multiple signatures start with md5 '1'" in err


def test_gather_lca_db(runtmp):
    # can we do a 'sourmash gather' on an LCA database?
    query = utils.get_test_data('47+63.fa.sig')
    lca_db = utils.get_test_data('lca/47+63.lca.json')

    runtmp.sourmash('gather', query, lca_db)
    print(runtmp)
    assert 'NC_009665.1 Shewanella baltica OS185' in str(runtmp.last_result.out)


@utils.in_tempdir
Expand Down Expand Up @@ -1443,19 +1442,18 @@ def test_search_containment_s10():
assert '16.7%' in out


def test_search_containment_s10_no_max(run):
    # check --containment for s10/s10-small
    q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
    q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig')

    # passing both flags together must fail with ValueError...
    with pytest.raises(ValueError):
        run.run_sourmash('search', q1, q2, '--containment',
                         '--max-containment')

    # ...and the captured stderr must explain why.
    print(run.last_result.out)
    print(run.last_result.err)
    assert "ERROR: cannot specify both --containment and --max-containment!" in run.last_result.err


def test_search_max_containment_s10_pairwise():
Expand Down Expand Up @@ -4118,12 +4116,11 @@ def test_gather_abund_10_1_ignore_abundance(c):
some_results = False
for row in r:
some_results = True
assert row['average_abund'] is ''
assert row['median_abund'] is ''
assert row['std_abund'] is ''
assert row['average_abund'] == ''
assert row['median_abund'] == ''
assert row['std_abund'] == ''

assert some_results



@utils.in_tempdir
Expand Down
Loading

0 comments on commit fcdfce2

Please sign in to comment.