diff --git a/doc/command-line.md b/doc/command-line.md
index ecbd311840..54e323419e 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -1621,3 +1621,68 @@ sig` commands will output to stdout. So, for example,
`sourmash sketch ... -o - | sourmash sig describe -` will describe the
signatures that were just created.
+
+### Using manifests to explicitly refer to collections of files
+
+(sourmash v4.4.0 and later)
+
+Manifests are metadata catalogs of signatures that are used for
+signature selection and loading. They are used extensively by sourmash
+internals to speed up signature selection through picklists and
+pattern matching.
+
+Manifests can _also_ be used externally (via the command-line), and
+may be useful for organizing large collections of signatures.
+
+Suppose you have a large collection of signatures (`.sig` or `.sig.gz`
+files) under a directory. You can create a manifest file for them like so:
+```
+sourmash sig manifest <dir> -o <dir>/manifest.csv
+```
+and then use the manifest directly for sourmash operations:
+```
+sourmash sig fileinfo <dir>/manifest.csv
+```
+This manifest can be used as a database target for most sourmash
+operations - search, gather, etc. Note that manifests for directories
+must be placed within (and loaded from) the directory from which the
+manifest was generated; the specific manifest filename does not
+matter.
+
+A more advanced and slightly tricky way to use explicit manifest files
+is with lists of files. If you create a file with a path list
+containing the locations of loadable sourmash collections, you can run
+`sourmash sig manifest pathlist.txt -o mf.csv` to generate a manifest
+of all of the files. The resulting manifest in `mf.csv` can then be
+loaded directly. This is very handy when you have many sourmash
+signatures, or large signature files. The tricky part in doing this
+is that the manifest will store the same paths listed in the pathlist
+file - whether they are relative or absolute paths - and these paths
+must be resolvable by sourmash from the current working directory.
+This makes explicit manifests built from pathlist files less portable
+within or across systems than the other sourmash collections, which
+are all relocatable.
+
+For example, if you create a pathlist file `paths.txt` containing the
+following:
+```
+/path/to/zipfile.zip
+local_directory/some_signature.sig.gz
+local_dir2/
+```
+and then run:
+```
+sourmash sig manifest paths.txt -o mf.csv
+```
+you will be able to use `mf.csv` as a database for `sourmash search`
+and `sourmash gather` commands. But, because it contains two relative paths,
+you will only be able to use it _from the directory that contains those
+two relative paths_.
+
+**Our advice:** We suggest using zip file collections for most
+situations; we primarily recommend using explicit manifests for
+situations where you have a **very large** collection of signatures
+(1000s or more), and don't want to make multiple copies of signatures
+in the collection (as you would have to, with a zipfile). This can be
+useful if you want to refer to different subsets of the collection
+without making multiple copies in a zip file.
diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py
index f126981a6d..97a9d35e17 100644
--- a/src/sourmash/index/__init__.py
+++ b/src/sourmash/index/__init__.py
@@ -25,10 +25,14 @@
ZipFileLinearIndex - simple on-disk storage of signatures.
-class MultiIndex - in-memory storage and selection of signatures from multiple
-index objects, using manifests.
+MultiIndex - in-memory storage and selection of signatures from multiple
+index objects, using manifests. All signatures are kept in memory.
+
+StandaloneManifestIndex - load manifests directly, and do lazy loading of
+signatures on demand. No signatures are kept in memory.
LazyLoadedIndex - selection on manifests with loading of index on demand.
+(Consider using StandaloneManifestIndex instead.)
CounterGather - an ancillary class returned by the 'counter_gather()' method.
"""
@@ -39,6 +43,7 @@ class MultiIndex - in-memory storage and selection of signatures from multiple
from collections import namedtuple, Counter
import csv
from io import TextIOWrapper
+from collections import defaultdict
from ..search import make_jaccard_search_query, make_gather_query
from ..manifest import CollectionManifest
@@ -49,7 +54,12 @@ class MultiIndex - in-memory storage and selection of signatures from multiple
IndexSearchResult = namedtuple('Result', 'score, signature, location')
class Index(ABC):
+ # this will be removed soon; see sourmash#1894.
is_database = False
+
+ # 'manifest', when set, implies efficient selection and direct
+ # access to signatures. Signatures may be stored in the manifest
+ # or loaded on demand from disk depending on the class, however.
manifest = None
@abstractmethod
@@ -933,6 +943,11 @@ def sigloc_iter():
# build manifest; note, signatures are stored in memory.
# CTB: could do this on demand?
+ # CTB: should we use get_manifest functionality?
+ # CTB: note here that the manifest is created by iteration
+ # *even if it already exists.* This could be changed to be more
+ # efficient... but for now, use StandaloneManifestIndex if you
+ # want to avoid this when loading from multiple files.
manifest = CollectionManifest.create_manifest(sigloc_iter())
# create!
@@ -945,6 +960,8 @@ def load_from_directory(cls, pathname, *, force=False):
Takes directory path plus optional boolean 'force'. Attempts to
load all files ending in .sig or .sig.gz, by default; if 'force' is
True, will attempt to load _all_ files, ignoring errors.
+
+ Will not load anything other than JSON signature files.
"""
from ..sourmash_args import traverse_find_sigs
@@ -1007,8 +1024,8 @@ def load_from_path(cls, pathname, force=False):
def load_from_pathlist(cls, filename):
"""Create a MultiIndex from all files listed in a text file.
- Note: this will load signatures from directories and databases, too,
- if they are listed in the text file; it uses 'load_file_as_index'
+ Note: this will attempt to load signatures from each file,
+ including zip collections, etc; it uses 'load_file_as_index'
underneath.
"""
from ..sourmash_args import (load_pathlist_from_file,
@@ -1047,6 +1064,8 @@ class LazyLoadedIndex(Index):
from disk every time they are needed (e.g. 'find(...)', 'signatures()').
Wrapper class; signatures dynamically loaded from disk; uses manifests.
+
+ CTB: This may be redundant with StandaloneManifestIndex.
"""
def __init__(self, filename, manifest):
"Create an Index with given filename and manifest."
@@ -1139,3 +1158,126 @@ def select(self, **kwargs):
new_manifest = manifest.select_to_manifest(**kwargs)
return LazyLoadedIndex(self.filename, new_manifest)
+
+
+class StandaloneManifestIndex(Index):
+    """Load a standalone manifest as an Index.
+
+    This class is useful for the situation where you have a directory
+    with many signature collections underneath it, and you don't want to load
+    every collection each time you run sourmash.
+
+    Instead, you can run 'sourmash sig manifest <directory> -o mf.csv' to
+    output a manifest and then use this class to load 'mf.csv' directly.
+    Sketch type selection, picklists, and pattern matching will all work
+    directly on the manifest and will load signatures only upon demand.
+
+    One feature of this class is that absolute paths to sketches in
+    the 'internal_location' field of the manifests will be loaded properly.
+    This permits manifests to be constructed for various collections of
+    signatures that reside elsewhere, and not just below a single directory
+    prefix.
+
+    StandaloneManifestIndex does _not_ store signatures in memory.
+
+    This class overlaps in concept with LazyLoadedIndex and behaves
+    identically when a manifest contains only rows from a single
+    on-disk Index object. However, unlike LazyLoadedIndex, this class
+    can be used to reference multiple on-disk Index objects.
+
+    This class also overlaps in concept with MultiIndex when
+    MultiIndex.load_from_pathlist is used to load other Index
+    objects. However, this class does not store any signatures in
+    memory, unlike MultiIndex.
+    """
+    is_database = True
+
+    def __init__(self, manifest, location, *, prefix=None):
+        """Create object. 'location' is path of manifest file, 'prefix' is
+        prepended to signature paths when loading non-abspaths."""
+        assert manifest is not None
+        self.manifest = manifest
+        self._location = location
+        self.prefix = prefix
+
+    @classmethod
+    def load(cls, location, *, prefix=None):
+        """Load manifest file from given location.
+
+        If prefix is None (default), it is automatically set from dirname.
+        Set prefix='' to avoid this, or provide an explicit prefix.
+        """
+        if not os.path.isfile(location):
+            raise ValueError(f"provided manifest location '{location}' is not a file")
+
+        with open(location, newline='') as fp:
+            m = CollectionManifest.load_from_csv(fp)
+
+        if prefix is None:
+            prefix = os.path.dirname(location)
+
+        return cls(m, location, prefix=prefix)
+
+    @property
+    def location(self):
+        "Return the path to this manifest."
+        return self._location
+
+    def signatures_with_location(self):
+        "Return an iterator over all signatures and their locations."
+        yield from self._signatures_with_internal()
+
+    def signatures(self):
+        "Return an iterator over all signatures."
+        for ss, _ in self._signatures_with_internal():
+            yield ss
+
+    def _signatures_with_internal(self):
+        """Return an iterator over all sigs of (sig, internal_location)
+
+        Note that this is implemented differently from most Index
+        objects in that it only lists subselected parts of the
+        manifest, and not the original manifest. This was done out of
+        convenience: we don't currently have access to the original
+        manifest in this class.
+        """
+        # collect all internal locations
+        iloc_to_rows = defaultdict(list)
+        for row in self.manifest.rows:
+            iloc = row['internal_location']
+            iloc_to_rows[iloc].append(row)
+
+        # iterate over internal locations, selecting relevant sigs
+        for iloc, iloc_rows in iloc_to_rows.items():
+            # prepend prefix to relative paths (os.path.isabs is portable)
+            if not os.path.isabs(iloc) and self.prefix:
+                iloc = os.path.join(self.prefix, iloc)
+
+            picklist = CollectionManifest(iloc_rows).to_picklist()
+
+            idx = sourmash.load_file_as_index(iloc)
+            idx = idx.select(picklist=picklist)
+            for ss in idx.signatures():
+                yield ss, iloc
+
+    def __len__(self):
+        "Number of signatures in this manifest (after any select)."
+        return len(self.manifest)
+
+    def __bool__(self):
+        "Is this manifest empty?"
+        return bool(self.manifest)
+
+    def save(self, *args):
+        "Not supported; always raises NotImplementedError."
+        raise NotImplementedError
+
+    def insert(self, *args):
+        "Not supported; always raises NotImplementedError."
+        raise NotImplementedError
+
+    def select(self, **kwargs):
+        "Run 'select' on the manifest."
+        new_manifest = self.manifest.select_to_manifest(**kwargs)
+        return StandaloneManifestIndex(new_manifest, self._location,
+                                       prefix=self.prefix)
diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py
index 6b5eb09eaf..fb9119def4 100644
--- a/src/sourmash/lca/lca_db.py
+++ b/src/sourmash/lca/lca_db.py
@@ -58,6 +58,10 @@ class LCA_Database(Index):
"""
is_database = True
+ # we set manifest to None to avoid implication of fast on-disk access to
+ # sketches. This may be revisited later.
+ manifest = None
+
def __init__(self, ksize, scaled, moltype='DNA'):
self.ksize = int(ksize)
self.scaled = int(scaled)
diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py
index 25372cea02..36d9300e07 100644
--- a/src/sourmash/sourmash_args.py
+++ b/src/sourmash/sourmash_args.py
@@ -364,6 +364,12 @@ def _load_stdin(filename, **kwargs):
return db
+def _load_standalone_manifest(filename, **kwargs):
+    "Load collection from a standalone manifest CSV file"
+    from sourmash.index import StandaloneManifestIndex
+    return StandaloneManifestIndex.load(filename)
+
+
def _multiindex_load_from_pathlist(filename, **kwargs):
"Load collection from a list of signature/database files"
db = MultiIndex.load_from_pathlist(filename)
@@ -416,6 +422,7 @@ def _load_zipfile(filename, **kwargs):
# all loader functions, in order.
_loader_functions = [
("load from stdin", _load_stdin),
+ ("load from standalone manifest", _load_standalone_manifest),
("load from path (file or directory)", _multiindex_load_from_path),
("load from file list", _multiindex_load_from_pathlist),
("load SBT", _load_sbt),
diff --git a/tests/test-data/scaled/mf.csv b/tests/test-data/scaled/mf.csv
new file mode 100644
index 0000000000..e3ff4d09e7
--- /dev/null
+++ b/tests/test-data/scaled/mf.csv
@@ -0,0 +1,17 @@
+# SOURMASH-MANIFEST-VERSION: 1.0
+internal_location,md5,md5short,ksize,moltype,num,scaled,n_hashes,with_abundance,name,filename
+all.lca.json,455c2f95f2d0a95e176870659119f170,455c2f95,31,DNA,0,10000,93,0,,
+all.lca.json,684aa226f843eaa7e1e40fc5603d5f2a,684aa226,31,DNA,0,10000,48,0,,
+all.lca.json,7f7835d2dd27ba703e843eee4757f3c2,7f7835d2,31,DNA,0,10000,8,0,,
+all.lca.json,7ffcfaa4027d4153a991b6bd78cf39fe,7ffcfaa4,31,DNA,0,10000,45,0,,
+all.lca.json,d84ef28f610b1783f801734699cf7e40,d84ef28f,31,DNA,0,10000,45,0,,
+genome-s10+s11.fa.gz.sig,455c2f95f2d0a95e176870659119f170,455c2f95,31,DNA,0,10000,93,0,,../genome-s10+s11.fa.gz
+genome-s11.fa.gz.sig,7ffcfaa4027d4153a991b6bd78cf39fe,7ffcfaa4,31,DNA,0,10000,45,0,,../genome-s11.fa.gz
+all.sbt.zip,684aa226f843eaa7e1e40fc5603d5f2a,684aa226,31,DNA,0,10000,48,0,,../genome-s10.fa.gz
+all.sbt.zip,7f7835d2dd27ba703e843eee4757f3c2,7f7835d2,31,DNA,0,10000,8,0,,../genome-s10-small.fa.gz
+all.sbt.zip,7ffcfaa4027d4153a991b6bd78cf39fe,7ffcfaa4,31,DNA,0,10000,45,0,,../genome-s11.fa.gz
+all.sbt.zip,455c2f95f2d0a95e176870659119f170,455c2f95,31,DNA,0,10000,93,0,,../genome-s10+s11.fa.gz
+all.sbt.zip,d84ef28f610b1783f801734699cf7e40,d84ef28f,31,DNA,0,10000,45,0,,../genome-s12.fa.gz
+genome-s10-small.fa.gz.sig,7f7835d2dd27ba703e843eee4757f3c2,7f7835d2,31,DNA,0,10000,8,0,,../genome-s10-small.fa.gz
+genome-s12.fa.gz.sig,d84ef28f610b1783f801734699cf7e40,d84ef28f,31,DNA,0,10000,45,0,,../genome-s12.fa.gz
+genome-s10.fa.gz.sig,684aa226f843eaa7e1e40fc5603d5f2a,684aa226,31,DNA,0,10000,48,0,,../genome-s10.fa.gz
diff --git a/tests/test-data/scaled/pathlist.txt b/tests/test-data/scaled/pathlist.txt
new file mode 100644
index 0000000000..32b8b3bacd
--- /dev/null
+++ b/tests/test-data/scaled/pathlist.txt
@@ -0,0 +1,7 @@
+all.lca.json
+all.sbt.zip
+genome-s10+s11.fa.gz.sig
+genome-s10-small.fa.gz.sig
+genome-s10.fa.gz.sig
+genome-s11.fa.gz.sig
+genome-s12.fa.gz.sig
diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
index 2168cd9555..d588171f20 100644
--- a/tests/test_cmd_signature.py
+++ b/tests/test_cmd_signature.py
@@ -3376,6 +3376,31 @@ def test_sig_describe_2_exclude_db_pattern(runtmp):
assert line.strip() in out
+def test_sig_describe_3_manifest_works(runtmp):
+    # run 'sig describe' on a relative-path manifest in its proper location
+    manifest_path = utils.get_test_data('scaled/mf.csv')
+    runtmp.sourmash('sig', 'describe', manifest_path, '--csv', 'out.csv')
+
+    stdout = runtmp.last_result.out
+    print(stdout)
+
+    with open(runtmp.output('out.csv'), newline='') as fp:
+        csv_rows = list(csv.reader(fp))
+
+    assert len(csv_rows) == 16   # 15 signatures, plus header row
+
+
+def test_sig_describe_3_manifest_fails_when_moved(runtmp):
+    # copy a relative-path manifest away from the files it refers to;
+    # 'describe' should then fail because the signatures themselves can
+    # no longer be loaded. note: this exercises lazy loading.
+    src_manifest = utils.get_test_data('scaled/mf.csv')
+    shutil.copyfile(src_manifest, runtmp.output('mf.csv'))
+
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.sourmash('sig', 'describe', 'mf.csv')
+
+
@utils.in_tempdir
def test_sig_overlap(c):
# get overlap details
@@ -3566,6 +3591,13 @@ def test_sig_manifest_6_pathlist(runtmp):
assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list
assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list
+ # note: the manifest output for pathlists will contain the locations
+ # used in the pathlist. This is required by StandaloneManifestIndex.
+ for row in manifest.rows:
+ iloc = row['internal_location']
+ print(iloc)
+ assert iloc.startswith('/'), iloc
+
def test_sig_manifest_does_not_exist(runtmp):
with pytest.raises(SourmashCommandFailed):
diff --git a/tests/test_cmd_signature_fileinfo.py b/tests/test_cmd_signature_fileinfo.py
index a0847fe97a..ee90fc7ba4 100644
--- a/tests/test_cmd_signature_fileinfo.py
+++ b/tests/test_cmd_signature_fileinfo.py
@@ -330,3 +330,37 @@ def test_sig_fileinfo_does_not_exist(runtmp):
runtmp.run_sourmash('sig', 'fileinfo', 'does-not-exist')
assert "Cannot open 'does-not-exist' as a sourmash signature collection" in runtmp.last_result.err
+
+
+def test_sig_fileinfo_8_manifest_works(runtmp):
+    # run 'sig fileinfo' on a relative-path manifest in its proper location
+    manifest_path = utils.get_test_data('scaled/mf.csv')
+    runtmp.sourmash('sig', 'fileinfo', manifest_path)
+
+    out = runtmp.last_result.out
+    print(out)
+    for expected in ('15 sketches with DNA, k=31, scaled=10000 717 total hashes',
+                     'num signatures: 15',
+                     'has manifest? yes',
+                     'is database? yes',
+                     'path filetype: StandaloneManifestIndex'):
+        assert expected in out
+
+
+def test_sig_fileinfo_8_manifest_works_when_moved(runtmp):
+    # copy a relative-path manifest away from the files it refers to.
+    # unlike 'describe', 'fileinfo' still works here, because all the
+    # information it needs is in the manifest.
+    src_manifest = utils.get_test_data('scaled/mf.csv')
+    shutil.copyfile(src_manifest, runtmp.output('mf.csv'))
+
+    runtmp.sourmash('sig', 'fileinfo', 'mf.csv')
+
+    out = runtmp.last_result.out
+    print(out)
+    for expected in ('15 sketches with DNA, k=31, scaled=10000 717 total hashes',
+                     'num signatures: 15',
+                     'has manifest? yes',
+                     'is database? yes',
+                     'path filetype: StandaloneManifestIndex'):
+        assert expected in out
diff --git a/tests/test_index.py b/tests/test_index.py
index 95eebc6d34..d361517d59 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -13,7 +13,8 @@
from sourmash import load_one_signature, SourmashSignature
from sourmash.index import (LinearIndex, ZipFileLinearIndex,
make_jaccard_search_query, CounterGather,
- LazyLinearIndex, MultiIndex)
+ LazyLinearIndex, MultiIndex,
+ StandaloneManifestIndex)
from sourmash.index.revindex import RevIndex
from sourmash.sbt import SBT, GraphFactory, Leaf
from sourmash.sbtmh import SigLeaf
@@ -21,6 +22,7 @@
from sourmash.search import JaccardSearch, SearchType
from sourmash.picklist import SignaturePicklist, PickStyle
from sourmash_tst_utils import SourmashCommandFailed
+from sourmash.manifest import CollectionManifest
import sourmash_tst_utils as utils
@@ -2388,6 +2390,7 @@ def test_lazy_loaded_index_3_find(runtmp):
x = list(x)
assert len(x) == 0
+
def test_revindex_index_search():
sig2 = utils.get_test_data("2.fa.sig")
sig47 = utils.get_test_data("47.fa.sig")
@@ -2485,3 +2488,258 @@ def is_found(ss, xx):
assert not is_found(ss47, results)
assert not is_found(ss2, results)
assert is_found(ss63, results)
+
+
+def test_standalone_manifest_signatures(runtmp):
+    # exercise the 'signatures' method of StandaloneManifestIndex.
+
+    ## borrow an in-memory manifest from a MultiIndex over two sig files
+    path47 = utils.get_test_data('47.fa.sig')
+    path63 = utils.get_test_data('63.fa.sig')
+
+    ss47 = sourmash.load_one_signature(path47)
+    ss63 = sourmash.load_one_signature(path63)
+
+    indexes = [LinearIndex.load(path47), LinearIndex.load(path63)]
+
+
+    mi = MultiIndex.load(indexes, [path47, path63], "")
+
+    ## wrap that manifest in a StandaloneManifestIndex and read it back
+    mm = StandaloneManifestIndex(mi.manifest, None)
+
+    siglist = list(mm.signatures())
+    assert len(siglist) == 2
+    assert ss47 in siglist
+    assert ss63 in siglist
+
+
+def test_standalone_manifest_signatures_prefix(runtmp):
+    # try out 'prefix' for StandaloneManifestIndex
+
+    ## first, build a manifest in memory using MultiIndex
+    sig47 = utils.get_test_data('47.fa.sig')
+    sig63 = utils.get_test_data('63.fa.sig')
+
+    lidx1 = LinearIndex.load(sig47)
+    lidx2 = LinearIndex.load(sig63)
+    mi = MultiIndex.load([lidx1, lidx2], [sig47, sig63], "")
+
+    # ok, now reduce each iloc to a bare filename, so that the files can
+    # only be found again by way of 'prefix'
+    for row in mi.manifest.rows:
+        row['internal_location'] = os.path.basename(row['internal_location'])
+
+    ## this should succeed: the prefix points back at the test-data dir
+    mm = StandaloneManifestIndex(mi.manifest, None,
+                                 prefix=utils.get_test_data(''))
+
+    assert len(list(mm.signatures())) == 2
+
+
+def test_standalone_manifest_signatures_prefix_fail(runtmp):
+    # give StandaloneManifest the wrong prefix
+
+    ## first, build a manifest in memory using MultiIndex
+    sig47 = utils.get_test_data('47.fa.sig')
+    sig63 = utils.get_test_data('63.fa.sig')
+
+    lidx1 = LinearIndex.load(sig47)
+    lidx2 = LinearIndex.load(sig63)
+
+    mi = MultiIndex.load([lidx1, lidx2], [sig47, sig63], "")
+
+    # remove prefix from manifest, leaving bare filenames as locations
+    for row in mi.manifest.rows:
+        row['internal_location'] = os.path.basename(row['internal_location'])
+
+    ## got a manifest! ok, now test out StandaloneManifestIndex
+    mm = StandaloneManifestIndex(mi.manifest, None, prefix='foo')
+
+    # should fail: 'foo/47.fa.sig' does not exist
+    with pytest.raises(ValueError) as exc:
+        list(mm.signatures())
+
+    # check the exception's own message rather than the ExceptionInfo repr
+    assert "Error while reading signatures from 'foo/47.fa.sig'" in str(exc.value)
+
+
+def test_standalone_manifest_load_from_dir(runtmp):
+    # load a manifest with relative paths straight out of test-data
+    manifest_path = utils.get_test_data('scaled/mf.csv')
+    idx = sourmash.load_file_as_index(manifest_path)
+
+    assert len(list(idx.signatures())) == 15
+
+    # truthiness and length both come from the manifest
+    assert idx
+    assert len(idx) == 15
+
+    # neither modification method is supported
+    with pytest.raises(NotImplementedError):
+        idx.insert()
+    with pytest.raises(NotImplementedError):
+        idx.save('foo')
+
+    assert idx.location == manifest_path
+
+
+def test_standalone_manifest_lazy_load(runtmp):
+    # verify that signatures are really loaded lazily
+    source_sig = utils.get_test_data('47.fa.sig')
+    local_sig = runtmp.output('47.fa.sig')
+
+    # make a local copy and build an external manifest for it
+    shutil.copyfile(source_sig, local_sig)
+
+    # note: local_sig is an abspath
+    runtmp.sourmash('sig', 'manifest', local_sig, '-o', 'mf.csv')
+
+    # loading signatures works while the file is present:
+    idx = StandaloneManifestIndex.load(runtmp.output('mf.csv'))
+
+    loaded = list(idx.signatures())
+    assert len(loaded) == 1
+
+    # delete the signature file out from under the index
+    os.unlink(local_sig)
+
+    # the manifest itself is still usable...
+    assert len(idx) == 1
+
+    # ...but asking for the signatures must now raise.
+    with pytest.raises(ValueError):
+        list(idx.signatures())
+
+    # restore the file, and loading works again.
+    shutil.copyfile(source_sig, local_sig)
+    reloaded = list(idx.signatures())
+    assert len(reloaded) == 1
+
+
+def test_standalone_manifest_lazy_load_2_prefix(runtmp):
+    # verify lazy loading again, this time with an explicit prefix
+    source_sig = utils.get_test_data('47.fa.sig')
+    local_sig = runtmp.output('47.fa.sig')
+
+    # make a local copy and build an external manifest for it.
+    # note, a relative path to 47.fa.sig is used here; the manifest will
+    # contain just '47.fa.sig' as the location
+    shutil.copyfile(source_sig, local_sig)
+    runtmp.sourmash('sig', 'manifest', '47.fa.sig', '-o', 'mf.csv')
+
+    # loading signatures works while the file is present:
+    idx = StandaloneManifestIndex.load(runtmp.output('mf.csv'),
+                                       prefix=runtmp.output(''))
+
+    loaded = list(idx.signatures())
+    assert len(loaded) == 1
+
+    # delete the signature file out from under the index
+    os.unlink(local_sig)
+
+    # the manifest itself is still usable...
+    assert len(idx) == 1
+
+    # ...but asking for the signatures must now raise.
+    with pytest.raises(ValueError):
+        list(idx.signatures())
+
+    # restore the file, and loading works again.
+    shutil.copyfile(source_sig, local_sig)
+    reloaded = list(idx.signatures())
+    assert len(reloaded) == 1
+
+
+def test_standalone_manifest_search(runtmp):
+    # plain 'search' of one query signature against a manifest
+    manifest_path = utils.get_test_data('scaled/mf.csv')
+    query_path = utils.get_test_data('scaled/genome-s12.fa.gz.sig')
+
+    runtmp.sourmash('search', query_path, manifest_path)
+
+    stdout = runtmp.last_result.out
+    print(stdout)
+    assert '100.0% d84ef28f' in stdout
+
+
+def test_standalone_manifest_prefetch_lazy(runtmp):
+    # check that prefetch is actually doing lazy loading on manifest index.
+    orig_sig47 = utils.get_test_data('47.fa.sig')
+    sig47 = runtmp.output('47.fa.sig')
+    orig_sig2 = utils.get_test_data('2.fa.sig')
+    sig2 = runtmp.output('2.fa.sig')
+    orig_sig63 = utils.get_test_data('63.fa.sig')
+    sig63 = runtmp.output('63.fa.sig')
+
+    shutil.copyfile(orig_sig47, sig47)
+    runtmp.sourmash('sig', 'manifest', sig47, '-o', 'mf1.csv')
+    shutil.copyfile(orig_sig2, sig2)
+    runtmp.sourmash('sig', 'manifest', sig2, '-o', 'mf2.csv')
+    shutil.copyfile(orig_sig63, sig63)
+    runtmp.sourmash('sig', 'manifest', sig63, '-o', 'mf3.csv')
+
+    # combine the manifests, manually for now...
+    with open(runtmp.output('mf1.csv'), newline='') as fp:
+        mf1 = CollectionManifest.load_from_csv(fp)
+        assert len(mf1) == 1
+
+    with open(runtmp.output('mf2.csv'), newline='') as fp:
+        mf2 = CollectionManifest.load_from_csv(fp)
+        assert len(mf2) == 3
+
+    with open(runtmp.output('mf3.csv'), newline='') as fp:
+        mf3 = CollectionManifest.load_from_csv(fp)
+        assert len(mf3) == 1
+
+    all_rows = list(mf1.rows) + list(mf2.rows) + list(mf3.rows)
+    print(all_rows)
+    mf = CollectionManifest(all_rows)
+    assert len(mf) == 5
+    with open(runtmp.output('mf.csv'), 'w', newline='') as fp:
+        mf.write_to_csv(fp, write_header=True)
+
+    # ok! now, remove the last signature, 'sig63'.
+    os.unlink(sig63)
+
+    # ...but loading the manifest should still work.
+    idx = StandaloneManifestIndex.load(runtmp.output('mf.csv'))
+
+    # double check - third load will fail. this relies on load order :shrug:.
+    sig_iter = iter(idx.signatures())
+    ss = next(sig_iter)
+    print(ss)
+    assert '47.fa' in ss.filename
+
+    for i in range(3):
+        ss = next(sig_iter)
+        print(i, ss)
+        assert '2.fa' in ss.filename
+
+    # message checks go after the 'with' block so that they actually run
+    with pytest.raises(ValueError) as exc:
+        next(sig_iter)
+    assert 'Error while reading signatures from' in str(exc.value)
+    assert '63.fa.sig' in str(exc.value)
+
+    # ok! now test prefetch... should get one match legit, to 47,
+    # and then no matches to 2, and then error.
+    ss47 = sourmash.load_one_signature(sig47)
+    idx = idx.select(ksize=31)
+    g = idx.prefetch(ss47, threshold_bp=0)
+
+    # first value:
+    sr = next(g)
+    assert sr.signature == ss47
+
+    # second value should raise error.
+    with pytest.raises(ValueError) as exc:
+        next(g)
+
+    assert 'Error while reading signatures from' in str(exc.value)
+    assert '63.fa.sig' in str(exc.value)
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 9f483f9a1e..b775bdf193 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5196,3 +5196,58 @@ def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather):
assert "1.0 kbp 100.0% 100.0%" in runtmp.last_result.out
assert "found 1 matches total;" in runtmp.last_result.out
+
+
+def test_standalone_manifest_search(runtmp):
+    # build a manifest for a directory tree via the CLI, then search it.
+    sig47 = utils.get_test_data('47.fa.sig')
+    sig63 = utils.get_test_data('63.fa.sig')
+
+    # lay out somedir/47.fa.sig and somedir/subdir/63.fa.sig
+    top_dir = runtmp.output('somedir')
+    os.mkdir(top_dir)
+    nested_dir = runtmp.output('somedir/subdir')
+    os.mkdir(nested_dir)
+    shutil.copyfile(sig47, os.path.join(top_dir, '47.fa.sig'))
+    shutil.copyfile(sig63, os.path.join(nested_dir, '63.fa.sig'))
+
+    # for now, the output manifest must be within top level dir for
+    # CLI stuff to work properly.
+    manifest_path = os.path.join(top_dir, 'mf.csv')
+
+    # build manifest...
+    runtmp.sourmash('sig', 'manifest', top_dir, '-o', manifest_path)
+
+    # ...and now use for a search!
+    runtmp.sourmash('search', sig47, manifest_path)
+
+    stdout = runtmp.last_result.out
+    print(stdout)
+    print(runtmp.last_result.err)
+    assert "100.0% NC_009665.1 Shewanella baltica OS185, complete genome" in stdout
+
+
+def test_standalone_manifest_search_fail(runtmp):
+    # build a manifest for a directory tree via the CLI, but write it
+    # outside the top-level directory; searching it should then fail.
+    sig47 = utils.get_test_data('47.fa.sig')
+    sig63 = utils.get_test_data('63.fa.sig')
+
+    # lay out somedir/47.fa.sig and somedir/subdir/63.fa.sig
+    top_dir = runtmp.output('somedir')
+    os.mkdir(top_dir)
+    nested_dir = runtmp.output('somedir/subdir')
+    os.mkdir(nested_dir)
+    shutil.copyfile(sig47, os.path.join(top_dir, '47.fa.sig'))
+    shutil.copyfile(sig63, os.path.join(nested_dir, '63.fa.sig'))
+
+    # for now, the output manifest must be within top level dir for
+    # CLI stuff to work properly. here we intentionally break this,
+    # for testing purposes.
+    manifest_path = runtmp.output('mf.csv')
+
+    # build manifest...
+    runtmp.sourmash('sig', 'manifest', top_dir, '-o', manifest_path)
+
+    # ...and now use for a search!
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.sourmash('search', sig47, manifest_path)