Skip to content

Commit

Permalink
MRG: transition internal signature loading functions (#3161)
Browse files Browse the repository at this point in the history
Note: PR into #3153

Tackles some signature loading and saving cleanup throughout the
codebase. Most changes are in the tests, and this is a significant
cleanup of the test code!

Fixes #1062.

---

Goals:
* deprecate external use of `sourmash.signature` load/save functions,
because they are JSON-specific and inflexible.
* simplify and standardize signature load/save function usage during
tests;
* get rid of deprecation messages during tests;

In brief,

* rename `sourmash.signature.load_signatures` to
`load_signatures_from_json`;
* rename `sourmash.signature.load_one_signature` to
`load_one_signature_from_json`;
* rename `sourmash.signature.save_signatures` to
`save_signatures_to_json`;
* deprecate `sourmash.save_signatures` and `sourmash.load_one_signature`
for 5.0 (joining `load_signatures`, which was already deprecated);
* reduce/eliminate deprecations by transitioning internal test code to
use these three functions directly from `sourmash.signature` instead of
from the top-level sourmash import.
* **bonus**: eliminate zipfile UserWarning around overwriting files,
which causes lots of warnings when running tests.

---

Done:
- [x] in sourmash.signature submodule, rename `load_signatures` to
`load_signatures_from_json`, `load_one_signature` to
`load_one_signatures_from_json`, and `save_signatures` to
`save_signatures_to_json`; make tests pass.
- [x] deprecate `sourmash.load_one_signature` and
`sourmash.save_signatures`.
- [x] catch zipfile UserWarning for duplicate filenames in
ZipStorage.save

TODO:
- [x] transition internal sourmash code+tests away from deprecated
functions
- [ ] create issue around changing API documentation prior to 5.0;

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
ctb and pre-commit-ci[bot] authored Jun 4, 2024
1 parent 2542c69 commit 9283dc4
Show file tree
Hide file tree
Showing 21 changed files with 686 additions and 614 deletions.
34 changes: 30 additions & 4 deletions src/sourmash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ class MinHash - hash sketch class
MAX_HASH = get_minhash_max_hash()

from .signature import (
load_signatures as load_signatures_private,
load_one_signature,
load_signatures_from_json,
load_one_signature_from_json,
SourmashSignature,
save_signatures,
save_signatures_to_json,
)


Expand All @@ -69,7 +69,33 @@ def load_signatures(*args, **kwargs):
has been removed and the function no longer outputs to stderr.
Moreover, do_raise is now True by default.
"""
return load_signatures_private(*args, **kwargs)
return load_signatures_from_json(*args, **kwargs)


@deprecated(
deprecated_in="4.8.9",
removed_in="5.0",
current_version=VERSION,
details="Use load_file_as_signatures instead.",
)
def load_one_signature(*args, **kwargs):
"""Load a JSON string with signatures into classes.
Returns list of SourmashSignature objects.
Note, the order is not necessarily the same as what is in the source file.
"""
return load_one_signature_from_json(*args, **kwargs)


@deprecated(
deprecated_in="4.8.9",
removed_in="5.0",
current_version=VERSION,
details="use sourmash_args.SaveSignaturesToLocation instead.",
)
def save_signatures(*args, **kwargs):
return save_signatures_to_json(*args, **kwargs)


from .sbtmh import load_sbt_index as load_sbt_index_private
Expand Down
13 changes: 7 additions & 6 deletions src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
)
from sourmash.manifest import CollectionManifest
from sourmash.logging import debug_literal
from sourmash.signature import load_signatures, save_signatures
from sourmash.signature import load_signatures_from_json, save_signatures_to_json

from sourmash.minhash import (
flatten_and_downsample_scaled,
flatten_and_downsample_num,
Expand Down Expand Up @@ -425,12 +426,12 @@ def insert(self, node):

def save(self, path):
with open(path, "w") as fp:
save_signatures(self.signatures(), fp)
save_signatures_to_json(self.signatures(), fp)

@classmethod
def load(cls, location, filename=None):
"Load signatures from a JSON signature file."
si = load_signatures(location, do_raise=True)
si = load_signatures_from_json(location, do_raise=True)

if filename is None:
filename = location
Expand Down Expand Up @@ -639,7 +640,7 @@ def _signatures_with_internal(self):
or self.traverse_yield_all
):
sig_data = self.storage.load(filename)
for ss in load_signatures(sig_data):
for ss in load_signatures_from_json(sig_data):
yield ss, filename

def signatures(self):
Expand All @@ -653,7 +654,7 @@ def signatures(self):
# yield all signatures found in manifest
for filename in manifest.locations():
data = self.storage.load(filename)
for ss in load_signatures(data):
for ss in load_signatures_from_json(data):
# in case multiple signatures are in the file, check
# to make sure we want to return each one.
if ss in manifest:
Expand Down Expand Up @@ -682,7 +683,7 @@ def select(x):
return True

data = self.storage.load(filename)
for ss in load_signatures(data):
for ss in load_signatures_from_json(data):
if select(ss):
yield ss

Expand Down
13 changes: 7 additions & 6 deletions src/sourmash/save_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,8 +299,8 @@ def _get_signatures_from_rust(siglist):
# minhash (and hence one md5sum) per signature, while
# Rust supports multiple. For now, go through serializing
# and deserializing the signature! See issue #1167 for more.
json_str = sourmash.save_signatures(siglist)
yield from sourmash.signature.load_signatures(json_str)
json_str = sigmod.save_signatures_to_json(siglist)
yield from sigmod.load_signatures_from_json(json_str)


class SaveSignatures_NoOutput(Base_SaveSignaturesToLocation):
Expand Down Expand Up @@ -362,7 +362,7 @@ def add(self, ss):
i += 1

with open(outname, "wb") as fp:
sigmod.save_signatures([ss], fp, compression=1)
sigmod.save_signatures_to_json([ss], fp, compression=1)


class SaveSignatures_SqliteIndex(Base_SaveSignaturesToLocation):
Expand Down Expand Up @@ -425,7 +425,7 @@ def open(self):

def close(self):
if self.location == "-":
sourmash.save_signatures(self.keep, sys.stdout)
sigmod.save_signatures_to_json(self.keep, sys.stdout)
else:
# text mode? encode in utf-8
mode = "w"
Expand All @@ -437,7 +437,7 @@ def close(self):
mode = "wb"

with open(self.location, mode, encoding=encoding) as fp:
sourmash.save_signatures(self.keep, fp, compression=self.compress)
sigmod.save_signatures_to_json(self.keep, fp, compression=self.compress)

def add(self, ss):
super().add(ss)
Expand Down Expand Up @@ -470,6 +470,7 @@ def close(self):
manifest_data = manifest_fp.getvalue().encode("utf-8")

self.storage.save(manifest_name, manifest_data, overwrite=True, compress=True)

self.storage.flush()
self.storage.close()

Expand Down Expand Up @@ -523,7 +524,7 @@ def add(self, add_sig):
raise ValueError("this output is not open")

for ss in _get_signatures_from_rust([add_sig]):
buf = sigmod.save_signatures([ss], compression=1)
buf = sigmod.save_signatures_to_json([ss], compression=1)
md5 = ss.md5sum()

storage = self.storage
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def signatures(self):
if self.manifest:
# if manifest, use it & load using direct path to storage.
# this will be faster when using picklists.
from .signature import load_one_signature
from .signature import load_one_signature_from_json

manifest = self.manifest

Expand All @@ -175,7 +175,7 @@ def signatures(self):
buf = self.storage.load(loc)
# if more than one signature can be in a file, we need
# to recheck picklists here.
ss = load_one_signature(buf)
ss = load_one_signature_from_json(buf)
yield ss
else:
# no manifest? iterate over all leaves.
Expand Down
47 changes: 26 additions & 21 deletions src/sourmash/sbt_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import zipfile
from abc import ABC
from pathlib import Path
import warnings

from ._lowlevel import ffi, lib
from .utils import RustObject, rustcall, decode_str
Expand Down Expand Up @@ -279,27 +280,31 @@ def _write_to_zf(self, zf, path, content, *, compress=False):
zi.external_attr = perms

def save(self, path, content, *, overwrite=False, compress=False):
# First try to save to self.zipfile, if it is not writable
# or would introduce duplicates then try to save it in the buffer
if overwrite:
newpath = path
do_write = True
else:
newpath, do_write = self._generate_filename(self.zipfile, path, content)
if do_write:
try:
self._write_to_zf(self.zipfile, newpath, content, compress=compress)
except (ValueError, RuntimeError):
# Can't write in the zipfile, write in buffer instead
# CTB: do we need to generate a new filename wrt to the
# bufferzip, too? Not sure this code is working as intended...
if self.bufferzip:
self._write_to_zf(
self.bufferzip, newpath, content, compress=compress
)
else:
# Throw error, can't write the data
raise ValueError("can't write data")
# ignore UserWarnings for duplicate filenames.
with warnings.catch_warnings():
warnings.simplefilter("ignore")

# First try to save to self.zipfile, if it is not writable
# or would introduce duplicates then try to save it in the buffer
if overwrite:
newpath = path
do_write = True
else:
newpath, do_write = self._generate_filename(self.zipfile, path, content)
if do_write:
try:
self._write_to_zf(self.zipfile, newpath, content, compress=compress)
except (ValueError, RuntimeError):
# Can't write in the zipfile, write in buffer instead
# CTB: do we need to generate a new filename wrt to the
# bufferzip, too? Not sure this code is working as intended...
if self.bufferzip:
self._write_to_zf(
self.bufferzip, newpath, content, compress=compress
)
else:
# Throw error, can't write the data
raise ValueError("can't write data")

return newpath

Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/sbtmh.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def save(self, path):
# content...)
self.data

buf = signature.save_signatures([self.data], compression=1)
buf = signature.save_signatures_to_json([self.data], compression=1)
return self.storage.save(path, buf)

def update(self, parent):
Expand All @@ -70,7 +70,7 @@ def update(self, parent):
def data(self):
if self._data is None:
buf = BytesIO(self.storage.load(self._path))
self._data = signature.load_one_signature(buf)
self._data = signature.load_one_signature_from_json(buf)
return self._data

@data.setter
Expand Down
10 changes: 6 additions & 4 deletions src/sourmash/signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def _detect_input_type(data):
return SigInput.UNKNOWN


def load_signatures(
def load_signatures_from_json(
data,
ksize=None,
select_moltype=None,
Expand Down Expand Up @@ -469,8 +469,10 @@ def load_signatures(
raise


def load_one_signature(data, ksize=None, select_moltype=None, ignore_md5sum=False):
sigiter = load_signatures(
def load_one_signature_from_json(
data, ksize=None, select_moltype=None, ignore_md5sum=False
):
sigiter = load_signatures_from_json(
data, ksize=ksize, select_moltype=select_moltype, ignore_md5sum=ignore_md5sum
)

Expand All @@ -487,7 +489,7 @@ def load_one_signature(data, ksize=None, select_moltype=None, ignore_md5sum=Fals
raise ValueError("expected to load exactly one signature")


def save_signatures(siglist, fp=None, compression=0):
def save_signatures_to_json(siglist, fp=None, compression=0):
"Save multiple signatures into a JSON string (or into file handle 'fp')"
attached_refs = weakref.WeakKeyDictionary()

Expand Down
Loading

0 comments on commit 9283dc4

Please sign in to comment.