[MRG] add basic picklist functionality to sourmash sig extract (#1587)

* various cleanups of sourmash_args
* cleanup flakes errors
* clean up sourmash.sig submodule
* initial picklist implementation
* integrate picklists into sourmash sig extract
* basic tests for picklist functionality
* track found etc
* split pickfile out a little bit
* split column_type out of SignaturePicklist a bit
* update comments, constructor, etc.
* fix tests :)
* more picklist tests
* verify output
* add --picklist-require-all &c
* documentation
* test with --md5 selector
* cover untested code with tests
* trap errors and be nice to users
sourmash-bio · Jun 16, 2021 · b787b75 · b787b75
1 parent ff75ec0
commit b787b75
Show file tree

Hide file tree

Showing 5 changed files with 696 additions and 14 deletions.
diff --git a/doc/command-line.md b/doc/command-line.md
@@ -818,6 +818,40 @@ sourmash signature extract tests/test-data/*.fa.sig --name NC_009665
 will extract the same signature, which has an accession number of
 `NC_009665.1`.
 
+#### Using picklists with `sourmash sig extract`
+
+As of sourmash 4.2.0, `extract` also supports picklists, a feature by
+which you can select signatures based on values in a CSV file.
+
+For example,
+```
+sourmash sig extract --picklist list.csv:md5:md5sum <signatures>
+```
+will extract only the signatures that have md5sums matching the
+column `md5sum` in the CSV file `list.csv`.
+
+The `--picklist` argument string must be of the format
+`pickfile:colname:coltype`, where `pickfile` is the path to a CSV
+file, `colname` is the name of the column to select from the CSV
+file (based on the headers in the first line of the CSV file),
+and `coltype` is the type of match.
+
+The following `coltype`s are currently supported by `sourmash sig extract`:
+
+* `name` - exact match to signature's name
+* `md5` - exact match to signature's md5sum
+* `md5prefix8` - match to 8-character prefix of signature's md5sum
+* `md5short` - same as `md5prefix8`
+* `ident` - exact match to signature's identifier
+* `identprefix` - match to the part of the signature's identifier before the first '.'
+
+Identifiers are constructed by using the first space-delimited word in
+the signature name.
+
+One way to build a picklist is to use `sourmash sig describe --csv
+out.csv <signatures>` to construct an initial CSV file that you can
+then edit further.
+
 ### `sourmash signature flatten` - remove abundance information from signatures
 
 Flatten the specified signature(s), removing abundances and setting

diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py
@@ -25,6 +25,14 @@ def subparser(subparsers):
         '--name', default=None,
         help='select signatures whose name contains this substring'
     )
+    subparser.add_argument(
+        '--picklist', default=None,
+        help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'"
+    )
+    subparser.add_argument(
+        '--picklist-require-all', default=False, action='store_true',
+        help="require that all picklist values be found or else fail"
+    )
     add_ksize_arg(subparser, 31)
     add_moltype_args(subparser)
 

diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py
@@ -13,6 +13,7 @@
 from sourmash.logging import set_quiet, error, notify, print_results, debug
 from sourmash import sourmash_args
 from sourmash.minhash import _get_max_hash_for_scaled
+from .picklist import SignaturePicklist
 
 usage='''
 sourmash signature <command> [<args>] - manipulate/work with signature files.
@@ -188,6 +189,7 @@ def describe(args):
     w = None
     csv_fp = None
     if args.csv:
+        # CTB: might want to switch to sourmash_args.FileOutputCSV here?
         csv_fp = open(args.csv, 'w', newline='')
         w = csv.DictWriter(csv_fp,
                            ['signature_file', 'md5', 'ksize', 'moltype', 'num',
@@ -216,8 +218,10 @@ def describe(args):
                 if mh.track_abundance:
                     with_abundance = 1
                 md5 = sig.md5sum()
-                name = sig.name or "** no name **"
-                filename = sig.filename or "** no name **"
+                name = sig.name
+                p_name = name or "** no name **"
+                filename = sig.filename
+                p_filename = filename or "** no name **"
                 license = sig.license
 
                 if w:
@@ -226,8 +230,8 @@ def describe(args):
                 print_results('''\
 ---
 signature filename: {signature_file}
-signature: {name}
-source file: {filename}
+signature: {p_name}
+source file: {p_filename}
 md5: {md5}
 k={ksize} molecule={moltype} num={num} scaled={scaled} seed={seed} track_abundance={with_abundance}
 size: {n_hashes}
@@ -539,6 +543,48 @@ def extract(args):
     set_quiet(args.quiet)
     moltype = sourmash_args.calculate_moltype(args)
 
+    picklist = None
+    if args.picklist:
+        try:
+            picklist = SignaturePicklist.from_picklist_args(args.picklist)
+        except ValueError as exc:
+            error("ERROR: could not load picklist.")
+            error(str(exc))
+            sys.exit(-1)
+
+        notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'")
+
+        n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name)
+
+        notify(f"loaded {len(picklist.pickset)} distinct values into picklist.")
+        if n_empty_val:
+            notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file")
+        if dup_vals:
+            notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct")
+        picklist_filter_fn = picklist.filter
+    else:
+        def picklist_filter_fn(it):
+            for ss in it:
+                yield ss
+
+    # further filtering on md5 or name?
+    if args.md5 is not None or args.name is not None:
+        def filter_fn(it):
+            for ss in picklist_filter_fn(it):
+                # match?
+                keep = False
+                if args.name and args.name in str(ss):
+                    keep = True
+                if args.md5 and args.md5 in ss.md5sum():
+                    keep = True
+
+                if keep:
+                    yield ss
+    else:
+        # whatever comes out of the picklist is fine
+        filter_fn = picklist_filter_fn
+
+    # ok! filtering defined, let's go forward
     progress = sourmash_args.SignatureLoadingProgress()
 
     save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
@@ -549,26 +595,26 @@ def extract(args):
                                                         ksize=args.ksize,
                                                         select_moltype=moltype,
                                                         progress=progress)
-        siglist = list(siglist)
-
-        # select!
-        if args.md5 is not None:
-            siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
-        if args.name is not None:
-            siglist = [ ss for ss in siglist if args.name in str(ss) ]
-
-        for ss in siglist:
+        for ss in filter_fn(siglist):
             save_sigs.add(ss)
 
     notify(f"loaded {len(progress)} total that matched ksize & molecule type")
     if not save_sigs:
-        error("no matching signatures!")
+        error("no matching signatures to save!")
         sys.exit(-1)
 
     save_sigs.close()
 
     notify("extracted {} signatures from {} file(s)", len(save_sigs),
            len(args.signatures))
+    if picklist:
+        notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values")
+        n_missing = len(picklist.pickset - picklist.found)
+        if n_missing:
+            notify(f"WARNING: {n_missing} missing picklist values.")
+            if args.picklist_require_all:
+                error("ERROR: failing because --picklist-require-all was set")
+                sys.exit(-1)
 
 
 def filter(args):

diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/sig/picklist.py
@@ -0,0 +1,144 @@
+"Picklist code for extracting subsets of signatures."
+import csv
+
+# Preprocessing functions, keyed by picklist column type. Each one takes a
+# raw string and returns the value that is actually stored and compared.
+preprocess = {
+    # exact matches: the value is used unchanged
+    'name': lambda x: x,
+    'md5': lambda x: x,
+
+    # identifier matches: first space-delimited word of the string,
+    # optionally truncated at the first '.'
+    'identprefix': lambda x: x.split(' ')[0].split('.')[0],
+    'ident': lambda x: x.split(' ')[0],
+
+    # prefix matches: only the first 8 characters are kept
+    'md5prefix8': lambda x: x[:8],
+    'md5short': lambda x: x[:8],
+}
+
+
+class SignaturePicklist:
+    """Picklist class for subsetting collections of signatures.
+
+    Initialize using ``SignaturePicklist.from_picklist_args(argstr)``,
+    which takes an argument str like so: 'pickfile:column:coltype'.
+
+    # CTB pickfile or pickset?
+    Here, 'pickfile' is the path to a CSV file; 'column' is the name of
+    the column to select from the CSV file; and 'coltype' is the type of
+    matching to do on that column.
+
+    'coltype's that are currently supported:
+    * 'name' - exact match to signature's name
+    * 'md5' - exact match to signature's md5sum
+    * 'md5prefix8' - match to 8-character prefix of signature's md5sum
+    * 'md5short' - same as md5prefix8
+    * 'ident' - exact match to signature's identifier
+    * 'identprefix' - match to signature's identifier, before '.'
+
+    Identifiers are constructed by using the first space delimited word in
+    the signature name.
+
+    Constructing the object does not read any file: ``pickset`` stays None
+    until ``load()`` or ``init()`` is called. Membership tests (``in`` and
+    ``filter()``) record each matched value in ``found`` and count every
+    test in ``n_queries``.
+    """
+    supported_coltypes = ('md5', 'md5prefix8', 'md5short',
+                          'name', 'ident', 'identprefix')
+
+    def __init__(self, coltype, *, pickfile=None, column_name=None):
+        """Create a picklist of column type 'coltype'.
+
+        'pickfile' and 'column_name' are only stored on the object here.
+
+        Raises ValueError if 'coltype' is not in ``supported_coltypes``.
+        """
+        self.coltype = coltype
+        self.pickfile = pickfile
+        self.column_name = column_name
+
+        if coltype not in self.supported_coltypes:
+            raise ValueError(f"invalid picklist column type '{coltype}'")
+
+        self.preprocess_fn = preprocess[coltype]
+        self.pickset = None       # set of values to match; None until init()
+        self.found = set()        # values from pickset that matched so far
+        self.n_queries = 0        # number of membership tests performed
+
+    @classmethod
+    def from_picklist_args(cls, argstr):
+        """Build a picklist from an argument string 'pickfile:column:coltype'.
+
+        Only the argument string is parsed; the CSV file is not opened.
+
+        Raises ValueError if 'argstr' does not consist of exactly three
+        ':'-separated fields, or if the coltype is not supported.
+        """
+        fields = argstr.split(':')
+        if len(fields) != 3:
+            raise ValueError(f"invalid picklist argument '{argstr}'")
+
+        pickfile, column, coltype = fields
+        return cls(coltype, pickfile=pickfile, column_name=column)
+
+    def _get_sig_attribute(self, ss):
+        """For a given signature 'ss', return the attribute this picklist uses.
+
+        md5-based coltypes use ``ss.md5sum()``; name/identifier-based
+        coltypes use ``ss.name``. The value is returned before preprocessing.
+        """
+        coltype = self.coltype
+        if coltype in ('md5', 'md5prefix8', 'md5short'):
+            return ss.md5sum()
+        if coltype in ('name', 'ident', 'identprefix'):
+            return ss.name
+
+        # __init__ rejects unsupported coltypes, so this is only reachable if
+        # self.coltype was changed afterwards. Raise explicitly rather than
+        # 'assert 0' so the check is not stripped under 'python -O'.
+        raise AssertionError(f"unhandled picklist column type '{coltype}'")
+
+    def init(self, values=()):
+        """Initialize this picklist with the given values; return the pickset.
+
+        Raises ValueError if the picklist has already been initialized.
+        """
+        if self.pickset is not None:
+            raise ValueError("already initialized?")
+        self.pickset = set(values)
+        return self.pickset
+
+    def load(self, pickfile, column_name):
+        """Load the pickset from column 'column_name' of CSV file 'pickfile'.
+
+        Returns a tuple (n_empty_val, dup_vals): the number of rows with an
+        empty value in the column, and the set of (preprocessed) values that
+        occurred more than once.
+
+        Raises ValueError if the picklist was already initialized, or if
+        'column_name' is not among the CSV headers (including the case of a
+        file with no header row at all).
+        """
+        pickset = self.init()
+
+        n_empty_val = 0
+        dup_vals = set()
+        with open(pickfile, newline='') as csvfile:
+            r = csv.DictReader(csvfile)
+
+            # DictReader.fieldnames is None when the file has no first row
+            # (e.g. an empty file); treat that as "no columns" so the user
+            # gets the ValueError below instead of a TypeError.
+            fieldnames = r.fieldnames or ()
+            if column_name not in fieldnames:
+                raise ValueError(f"column '{column_name}' not in pickfile '{pickfile}'")
+
+            for row in r:
+                # pick out values from column
+                col = row[column_name]
+                if not col:
+                    n_empty_val += 1
+                    continue
+
+                col = self.preprocess_fn(col)
+
+                # duplicates are detected after preprocessing, so two rows
+                # that differ only past e.g. the md5 prefix count as dups.
+                if col in pickset:
+                    dup_vals.add(col)
+                else:
+                    self.add(col)
+
+        return n_empty_val, dup_vals
+
+    def add(self, value):
+        "Add a value to this picklist."
+        self.pickset.add(value)
+
+    def __contains__(self, ss):
+        """Does this signature match anything in the picklist?
+
+        Side effects: increments ``n_queries`` on every call, and records
+        the matched value in ``found`` when the answer is True.
+        """
+        # pull out the relevant signature attribute
+        q = self._get_sig_attribute(ss)
+
+        # mangle into the kinds of values we support here
+        q = self.preprocess_fn(q)
+
+        # add to the number of queries performed,
+        self.n_queries += 1
+
+        # determine if ok or not.
+        if q in self.pickset:
+            self.found.add(q)
+            return True
+        return False
+
+    def filter(self, it):
+        "yield all signatures in the given iterator that are in the picklist"
+        for ss in it:
+            if ss in self:
+                yield ss