Skip to content

Commit

Permalink
save to v4, but keep _save_v5 around
Browse files (browse the repository at this point in the history)
  • Loading branch information
luizirber committed Jul 8, 2019
1 parent 306c890 commit 53fb1de
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 7 deletions.
2 changes: 1 addition & 1 deletion sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ def index(args):
scaleds.add(ss.minhash.scaled)

leaf = SigLeaf(ss.md5sum(), ss)
tree.add_node(leaf, update_internal=False)
tree.add_node(leaf)
n += 1

if not ss:
Expand Down
80 changes: 76 additions & 4 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,82 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
str
full path to the new SBT description
"""
version = 4

if path.endswith('.sbt.json'):
path = path[:-9]
fn = os.path.abspath(path + '.sbt.json')

if storage is None:
# default storage
location = os.path.dirname(fn)
subdir = '.sbt.{}'.format(os.path.basename(path))

storage = FSStorage(location, subdir)
fn = os.path.join(location, fn)

backend = [k for (k, v) in STORAGES.items() if v == type(storage)][0]

info = {}
info['d'] = self.d
info['version'] = version
info['storage'] = {
'backend': backend,
'args': storage.init_args()
}
info['factory'] = {
'class': GraphFactory.__name__,
'args': self.factory.init_args()
}

if not self.is_ready:
self._fill_internal()

nodes = {}
total_nodes = len(self)
for n, (i, node) in enumerate(self):
if node is None:
continue

if isinstance(node, Node):
if random() - sparseness <= 0:
continue

data = {
# TODO: start using md5sum instead?
'filename': os.path.basename(node.name),
'name': node.name
}

try:
node.metadata.pop('max_n_below')
except (AttributeError, KeyError):
pass

data['metadata'] = node.metadata

if structure_only is False:
# trigger data loading before saving to the new place
node.data

node.storage = storage

data['filename'] = node.save(data['filename'])

node.storage = storage
data['filename'] = node.save(data['filename'])
nodes[i] = data

notify("{} of {} nodes saved".format(n+1, total_nodes), end='\r')

notify("\nFinished saving nodes, now saving SBT json file.")
info['nodes'] = nodes
with open(fn, 'w') as fp:
json.dump(info, fp)

return fn

def _save_v5(self, path, storage=None, sparseness=0.0, structure_only=False):
version = 5

if path.endswith('.sbt.json'):
Expand Down Expand Up @@ -671,10 +747,6 @@ def _load_v4(cls, info, leaf_loader, dirname, storage, print_version_warning=Tru

tree.next_node = max_node

if print_version_warning:
error("WARNING: this is an old index version, please run `sourmash migrate` to update it.")
error("WARNING: proceeding with execution, but it will take longer to finish!")

return tree

@classmethod
Expand Down
42 changes: 40 additions & 2 deletions tests/test_sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def test_tree_v1_load():
tree_v1 = SBT.load(utils.get_test_data('v1.sbt.json'),
leaf_loader=SigLeaf.load)

tree_cur = SBT.load(utils.get_test_data('v3.sbt.json'),
tree_cur = SBT.load(utils.get_test_data('v4.sbt.json'),
leaf_loader=SigLeaf.load)

testdata1 = utils.get_test_data(utils.SIG_FILES[0])
Expand All @@ -153,7 +153,45 @@ def test_tree_v2_load():
tree_v2 = SBT.load(utils.get_test_data('v2.sbt.json'),
leaf_loader=SigLeaf.load)

tree_cur = SBT.load(utils.get_test_data('v3.sbt.json'),
tree_cur = SBT.load(utils.get_test_data('v4.sbt.json'),
leaf_loader=SigLeaf.load)

testdata1 = utils.get_test_data(utils.SIG_FILES[0])
to_search = next(signature.load_signatures(testdata1))

results_v2 = {str(s) for s in tree_v2.find(search_minhashes_containment,
to_search, 0.1)}
results_cur = {str(s) for s in tree_cur.find(search_minhashes_containment,
to_search, 0.1)}

assert results_v2 == results_cur
assert len(results_v2) == 4


def test_tree_v3_load():
    """Check that a v3 index gives the same search results as the v4 index.

    Both on-disk indexes are loaded, the first test signature is searched
    against each with a containment threshold of 0.1, and the two sets of
    matches are required to be identical and to contain four entries.
    """
    old_index, current_index = (
        SBT.load(utils.get_test_data(index_name), leaf_loader=SigLeaf.load)
        for index_name in ('v3.sbt.json', 'v4.sbt.json')
    )

    # The query is the first signature stored in the first test data file.
    query_path = utils.get_test_data(utils.SIG_FILES[0])
    query = next(signature.load_signatures(query_path))

    # Matches are compared by their string form, so ordering is irrelevant.
    old_hits = set()
    for match in old_index.find(search_minhashes_containment, query, 0.1):
        old_hits.add(str(match))

    current_hits = set()
    for match in current_index.find(search_minhashes_containment, query, 0.1):
        current_hits.add(str(match))

    assert old_hits == current_hits
    assert len(old_hits) == 4


def test_tree_v5_load():
tree_v2 = SBT.load(utils.get_test_data('v5.sbt.json'),
leaf_loader=SigLeaf.load)

tree_cur = SBT.load(utils.get_test_data('v4.sbt.json'),
leaf_loader=SigLeaf.load)

testdata1 = utils.get_test_data(utils.SIG_FILES[0])
Expand Down

0 comments on commit 53fb1de

Please sign in to comment.