Skip to content

Commit

Permalink
[MRG] add basic picklist functionality to sourmash sig extract (#1587)
Browse files Browse the repository at this point in the history
* various cleanups of sourmash_args

* cleanup flakes errors

* clean up sourmash.sig submodule

* initial picklist implementation

* integrate picklists into sourmash sig extract

* basic tests for picklist functionality

* track found etc

* split pickfile out a little bit

* split column_type out of SignaturePicklist a bit

* update comments, constructor, etc.

* fix tests :)

* more picklist tests

* verify output

* add --picklist-require-all &c

* documentation

* test with --md5 selector

* cover untested code with tests

* trap errors and be nice to users
  • Loading branch information
ctb authored Jun 16, 2021
1 parent ff75ec0 commit b787b75
Show file tree
Hide file tree
Showing 5 changed files with 696 additions and 14 deletions.
34 changes: 34 additions & 0 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,40 @@ sourmash signature extract tests/test-data/*.fa.sig --name NC_009665
will extract the same signature, which has an accession number of
`NC_009665.1`.

#### Using picklists with `sourmash sig extract`

As of sourmash 4.2.0, `extract` also supports picklists, a feature by
which you can select signatures based on values in a CSV file.

For example,
```
sourmash sig extract --picklist list.csv:md5:md5sum <signatures>
```
will extract only the signatures that have md5sums matching the
column `md5sum` in the CSV file `list.csv`.

The `--picklist` argument string must be of the format
`pickfile:colname:coltype`, where `pickfile` is the path to a CSV
file, `colname` is the name of the column to select from the CSV
file (based on the headers in the first line of the CSV file),
and `coltype` is the type of match.

The following `coltype`s are currently supported by `sourmash sig extract`:

* `name` - exact match to signature's name
* `md5` - exact match to signature's md5sum
* `md5prefix8` - match to 8-character prefix of signature's md5sum
* `md5short` - same as `md5prefix8`
* `ident` - exact match to signature's identifier
* `identprefix` - match to signature's identifier, before '.'

Identifiers are constructed by using the first space-delimited word in
the signature name.

One way to build a picklist is to use `sourmash sig describe --csv
out.csv <signatures>` to construct an initial CSV file that you can
then edit further.

### `sourmash signature flatten` - remove abundance information from signatures

Flatten the specified signature(s), removing abundances and setting
Expand Down
8 changes: 8 additions & 0 deletions src/sourmash/cli/sig/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ def subparser(subparsers):
'--name', default=None,
help='select signatures whose name contains this substring'
)
subparser.add_argument(
'--picklist', default=None,
help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'"
)
subparser.add_argument(
'--picklist-require-all', default=False, action='store_true',
help="require that all picklist values be found or else fail"
)
add_ksize_arg(subparser, 31)
add_moltype_args(subparser)

Expand Down
74 changes: 60 additions & 14 deletions src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sourmash.logging import set_quiet, error, notify, print_results, debug
from sourmash import sourmash_args
from sourmash.minhash import _get_max_hash_for_scaled
from .picklist import SignaturePicklist

usage='''
sourmash signature <command> [<args>] - manipulate/work with signature files.
Expand Down Expand Up @@ -188,6 +189,7 @@ def describe(args):
w = None
csv_fp = None
if args.csv:
# CTB: might want to switch to sourmash_args.FileOutputCSV here?
csv_fp = open(args.csv, 'w', newline='')
w = csv.DictWriter(csv_fp,
['signature_file', 'md5', 'ksize', 'moltype', 'num',
Expand Down Expand Up @@ -216,8 +218,10 @@ def describe(args):
if mh.track_abundance:
with_abundance = 1
md5 = sig.md5sum()
name = sig.name or "** no name **"
filename = sig.filename or "** no name **"
name = sig.name
p_name = name or "** no name **"
filename = sig.filename
p_filename = filename or "** no name **"
license = sig.license

if w:
Expand All @@ -226,8 +230,8 @@ def describe(args):
print_results('''\
---
signature filename: {signature_file}
signature: {name}
source file: {filename}
signature: {p_name}
source file: {p_filename}
md5: {md5}
k={ksize} molecule={moltype} num={num} scaled={scaled} seed={seed} track_abundance={with_abundance}
size: {n_hashes}
Expand Down Expand Up @@ -539,6 +543,48 @@ def extract(args):
set_quiet(args.quiet)
moltype = sourmash_args.calculate_moltype(args)

picklist = None
if args.picklist:
try:
picklist = SignaturePicklist.from_picklist_args(args.picklist)
except ValueError as exc:
error("ERROR: could not load picklist.")
error(str(exc))
sys.exit(-1)

notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'")

n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name)

notify(f"loaded {len(picklist.pickset)} distinct values into picklist.")
if n_empty_val:
notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file")
if dup_vals:
notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct")
picklist_filter_fn = picklist.filter
else:
def picklist_filter_fn(it):
for ss in it:
yield ss

# further filtering on md5 or name?
if args.md5 is not None or args.name is not None:
def filter_fn(it):
for ss in picklist_filter_fn(it):
# match?
keep = False
if args.name and args.name in str(ss):
keep = True
if args.md5 and args.md5 in ss.md5sum():
keep = True

if keep:
yield ss
else:
# whatever comes out of the picklist is fine
filter_fn = picklist_filter_fn

# ok! filtering defined, let's go forward
progress = sourmash_args.SignatureLoadingProgress()

save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
Expand All @@ -549,26 +595,26 @@ def extract(args):
ksize=args.ksize,
select_moltype=moltype,
progress=progress)
siglist = list(siglist)

# select!
if args.md5 is not None:
siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ]
if args.name is not None:
siglist = [ ss for ss in siglist if args.name in str(ss) ]

for ss in siglist:
for ss in filter_fn(siglist):
save_sigs.add(ss)

notify(f"loaded {len(progress)} total that matched ksize & molecule type")
if not save_sigs:
error("no matching signatures!")
error("no matching signatures to save!")
sys.exit(-1)

save_sigs.close()

notify("extracted {} signatures from {} file(s)", len(save_sigs),
len(args.signatures))
if picklist:
notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values")
n_missing = len(picklist.pickset - picklist.found)
if n_missing:
notify(f"WARNING: {n_missing} missing picklist values.")
if args.picklist_require_all:
error("ERROR: failing because --picklist-require-all was set")
sys.exit(-1)


def filter(args):
Expand Down
144 changes: 144 additions & 0 deletions src/sourmash/sig/picklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"Picklist code for extracting subsets of signatures."
import csv

# Preprocessing functions for picklist columns, keyed by column type.
# SignaturePicklist applies the selected function both to values read
# from the picklist CSV and to the matching signature attribute, so the
# two sides are compared in the same normalized form.


def _as_is(value):
    "exact match: use the value unchanged."
    return value


def _first_word(value):
    "identifier: the first space-delimited word of the value."
    return value.split(' ')[0]


def _first_word_stem(value):
    "identifier prefix: the identifier, cut at its first '.'."
    return _first_word(value).split('.')[0]


def _first_eight(value):
    "md5 prefix: the first 8 characters of the value."
    return value[:8]


preprocess = {
    # exact matches
    'name': _as_is,
    'md5': _as_is,
    # identifier matches/prefix foo - space delimited identifiers
    'identprefix': _first_word_stem,
    'ident': _first_word,
    # match 8 characters
    'md5prefix8': _first_eight,
    'md5short': _first_eight,
}


class SignaturePicklist:
    """Picklist class for subsetting collections of signatures.

    Initialize using ``SignaturePicklist.from_picklist_args(argstr)``,
    which takes an argument str like so: 'pickfile:column:coltype'.

    # CTB pickfile or pickset?

    Here, 'pickfile' is the path to a CSV file; 'column' is the name of
    the column to select from the CSV file; and 'coltype' is the type of
    matching to do on that column.

    'coltype's that are currently supported:
    * 'name' - exact match to signature's name
    * 'md5' - exact match to signature's md5sum
    * 'md5prefix8' - match to 8-character prefix of signature's md5sum
    * 'md5short' - same as md5prefix8
    * 'ident' - exact match to signature's identifier
    * 'identprefix' - match to signature's identifier, before '.'

    Identifiers are constructed by using the first space-delimited word in
    the signature name.

    Typical use: construct the picklist, fill ``pickset`` with ``load``
    (or ``init`` followed by ``add``), then test signatures with ``in``
    or ``filter``. Values matched so far accumulate in ``found`` and the
    number of membership tests performed is tracked in ``n_queries``.
    """
    supported_coltypes = ('md5', 'md5prefix8', 'md5short',
                          'name', 'ident', 'identprefix')

    def __init__(self, coltype, *, pickfile=None, column_name=None):
        """create a picklist of column type 'coltype'.

        'pickfile' and 'column_name' are only recorded here; nothing is
        read from disk until ``load`` is called.

        Raises ValueError if 'coltype' is not in ``supported_coltypes``.
        """
        self.coltype = coltype
        self.pickfile = pickfile
        self.column_name = column_name

        if coltype not in self.supported_coltypes:
            raise ValueError(f"invalid picklist column type '{coltype}'")

        # same normalization is applied to CSV values and to signatures
        self.preprocess_fn = preprocess[coltype]
        self.pickset = None             # set of values; None until init()
        self.found = set()              # picklist values matched so far
        self.n_queries = 0              # number of membership tests run

    @classmethod
    def from_picklist_args(cls, argstr):
        """load a picklist from an argument string 'pickfile:column:coltype'

        Raises ValueError if 'argstr' does not have exactly three
        ':'-separated parts, or if the column type is not supported.
        """
        picklist = argstr.split(':')
        if len(picklist) != 3:
            raise ValueError(f"invalid picklist argument '{argstr}'")

        pickfile, column, coltype = picklist

        return cls(coltype, pickfile=pickfile, column_name=column)

    def _get_sig_attribute(self, ss):
        """for a given SourmashSignature, return attribute for this picklist.

        Returns ``ss.md5sum()`` for the md5-based column types and
        ``ss.name`` for the name/identifier-based ones.
        """
        coltype = self.coltype
        if coltype in ('md5', 'md5prefix8', 'md5short'):
            return ss.md5sum()
        if coltype in ('name', 'ident', 'identprefix'):
            return ss.name

        # only reachable if self.coltype was changed after construction;
        # raise explicitly, since an 'assert' is stripped under 'python -O'.
        raise ValueError(f"invalid picklist column type '{coltype}'")

    def init(self, values=()):
        """initialize a Picklist object with given values.

        Returns the newly created pickset. Raises ValueError if the
        picklist has already been initialized.
        """
        if self.pickset is not None:
            raise ValueError("already initialized?")
        self.pickset = set(values)
        return self.pickset

    def load(self, pickfile, column_name):
        """load pickset, return num empty vals, and set of duplicate vals.

        Reads column 'column_name' from the CSV file 'pickfile', runs each
        non-empty value through the column type's preprocessing function,
        and adds it to the pickset.

        Raises ValueError if the picklist was already initialized, if the
        file has no header row, or if 'column_name' is not in the header.
        """
        pickset = self.init()

        n_empty_val = 0
        dup_vals = set()
        with open(pickfile, newline='') as csvfile:
            r = csv.DictReader(csvfile)

            # DictReader.fieldnames is None for a completely empty file;
            # report that as a ValueError rather than letting the 'in'
            # test below fail with a TypeError.
            fieldnames = r.fieldnames
            if not fieldnames:
                raise ValueError(f"empty or improperly formatted pickfile '{pickfile}'")

            if column_name not in fieldnames:
                raise ValueError(f"column '{column_name}' not in pickfile '{pickfile}'")

            for row in r:
                # pick out values from column
                col = row[column_name]
                if not col:
                    n_empty_val += 1
                    continue

                col = self.preprocess_fn(col)

                # look for duplicate values or empty values
                if col in pickset:
                    dup_vals.add(col)
                else:
                    self.add(col)

        return n_empty_val, dup_vals

    def add(self, value):
        "Add a value to this picklist."
        self.pickset.add(value)

    def __contains__(self, ss):
        """does this signature match anything in the picklist?

        Side effects: increments ``n_queries``, and records the matched
        value in ``found`` on success.
        """
        # pull out the relevant signature attribute
        q = self._get_sig_attribute(ss)

        # mangle into the kinds of values we support here
        q = self.preprocess_fn(q)

        # add to the number of queries performed,
        self.n_queries += 1

        # determine if ok or not.
        if q in self.pickset:
            self.found.add(q)
            return True
        return False

    def filter(self, it):
        "yield all signatures in the given iterator that are in the picklist"
        for ss in it:
            if ss in self:
                yield ss
Loading

0 comments on commit b787b75

Please sign in to comment.