diff --git a/.gitignore b/.gitignore index b763147ae1..db06c2c4ae 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ sourmash/_minhash.cpp .pytest_cache .python-version sourmash/version.py +*.DS_Store \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 72aafe064e..49e9b80133 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,10 +45,10 @@ needletail = { version = "~0.2.1", optional = true } serde = "1.0" serde_derive = "~1.0.58" serde_json = "1.0.2" -ukhs = "0.3.4" +ukhs = { git = "https://github.com/luizirber/ukhs", branch = "feature/alternative_backends", features = ["boomphf_mphf"], default-features = false} bio = { git = "https://github.com/luizirber/rust-bio", branch = "feature/fastx_reader" } primal = "0.2.3" -pdatastructs = "0.5.0" +pdatastructs = { git = "https://github.com/luizirber/pdatastructs.rs", branch = "succinct_wasm" } itertools = "0.8.0" typed-builder = "0.3.0" csv = "1.0.7" diff --git a/Makefile b/Makefile index c51b7ffcd0..079c6e2213 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ doc: .PHONY coverage: all $(PYTHON) setup.py clean --all SOURMASH_COVERAGE=1 $(PYTHON) setup.py build_ext -i - $(PYTHON) -m pytest --cov=. + $(PYTHON) -m pytest --cov=. --cov-report term-missing benchmark: all asv continuous master diff --git a/README.md b/README.md index 91fcb39bfc..e2d6619cb3 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ sourmash is a product of the We recommend using bioconda to install sourmash: ``` -conda install sourmash +conda install -c conda-forge -c bioconda sourmash ``` This will install the latest stable version of sourmash 2. diff --git a/doc/_static/cmp.matrix.png b/doc/_static/cmp.matrix.png index 931d452170..31791c3f9f 100644 Binary files a/doc/_static/cmp.matrix.png and b/doc/_static/cmp.matrix.png differ diff --git a/doc/command-line.md b/doc/command-line.md index 69658f60d7..1755ce60dc 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -20,12 +20,12 @@ taken. Grab three bacterial genomes from NCBI: ``` curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Escherichia_coli/reference/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz -curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Salmonella_enterica/reference/GCF_000006945.1_ASM694v1/GCF_000006945.1_ASM694v1_genomic.fna.gz +curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Salmonella_enterica/reference/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.fna.gz curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Sphingobacteriaceae_bacterium_DW12/latest_assembly_versions/GCF_000783305.1_ASM78330v1/GCF_000783305.1_ASM78330v1_genomic.fna.gz ``` Compute signatures for each: ``` - sourmash compute *.fna.gz + sourmash compute -k 31 *.fna.gz ``` This will produce three `.sig` files containing MinHash signatures at k=31. 
@@ -33,9 +33,16 @@ Next, compare all the signatures to each other: ``` sourmash compare *.sig -o cmp ``` + +Optionally, parallelize compare to 8 threads with `-p 8`: + +``` +sourmash compare -p 8 *.sig -o cmp +``` + Finally, plot a dendrogram: ``` -sourmash plot cmp +sourmash plot cmp --labels ``` This will output two files, `cmp.dendro.png` and `cmp.matrix.png`, containing a clustering & dendrogram of the sequences, as well as a diff --git a/doc/tutorial-basic.md b/doc/tutorial-basic.md index 8b2d200cae..8a4bf51eda 100644 --- a/doc/tutorial-basic.md +++ b/doc/tutorial-basic.md @@ -185,6 +185,12 @@ Compare all the things: sourmash compare ecoli_many_sigs/* -o ecoli_cmp ``` +Optionally, parallelize to 8 threads using `-p 8`: + +``` +sourmash compare -p 8 ecoli_many_sigs/* -o ecoli_cmp +``` + and then plot: ``` diff --git a/setup.py b/setup.py index 259da9b0de..e47182bfbd 100644 --- a/setup.py +++ b/setup.py @@ -69,10 +69,10 @@ 'setuptools_scm', 'setuptools_scm_git_archive'], "use_scm_version": {"write_to": "sourmash/version.py"}, "extras_require": { - 'test' : ['pytest', 'pytest-cov', 'numpy', 'matplotlib', 'scipy','recommonmark'], + 'test' : ['pytest', 'pytest-cov', 'numpy', 'matplotlib', 'scipy', 'recommonmark'], 'demo' : ['jupyter', 'jupyter_client', 'ipython'], 'doc' : ['sphinx'], - '10x': ['pathos', 'bamnostic>=0.9.2'], + '10x': ['pathos', 'pysam'] }, "include_package_data": True, "package_data": { diff --git a/sourmash/_minhash.pxd b/sourmash/_minhash.pxd index 1d6fe3a2dd..5b0aaf9e3f 100644 --- a/sourmash/_minhash.pxd +++ b/sourmash/_minhash.pxd @@ -25,15 +25,18 @@ cdef extern from "kmer_min_hash.hh": const unsigned int num; const unsigned int ksize; const bool is_protein; + const bool dayhoff; const HashIntoType max_hash; CMinHashType mins; - KmerMinHash(unsigned int, unsigned int, bool, uint32_t, HashIntoType) + KmerMinHash(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType) void add_hash(HashIntoType) except +ValueError void remove_hash(HashIntoType) except +ValueError void add_word(string word) except +ValueError void add_sequence(const char *, bool) except +ValueError void merge(const KmerMinHash&) except +ValueError + string aa_to_dayhoff(string aa) except +ValueError + string translate_codon(string codon) except +ValueError unsigned int count_common(const KmerMinHash&) except +ValueError unsigned long size() @@ -41,13 +44,15 @@ cdef extern from "kmer_min_hash.hh": cdef cppclass KmerMinAbundance(KmerMinHash): CMinHashType abunds; - KmerMinAbundance(unsigned int, unsigned int, bool, uint32_t, HashIntoType) + KmerMinAbundance(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType) void add_hash(HashIntoType) except +ValueError void remove_hash(HashIntoType) except +ValueError void add_word(string word) except +ValueError void add_sequence(const char *, bool) except +ValueError void merge(const KmerMinAbundance&) except +ValueError void merge(const KmerMinHash&) except +ValueError + string aa_to_dayhoff(string aa) except +ValueError + string translate_codon(string codon) except +ValueError unsigned int count_common(const KmerMinAbundance&) except +ValueError unsigned long size() diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx index af9471c305..beaff29f1c 100644 --- a/sourmash/_minhash.pyx +++ b/sourmash/_minhash.pyx @@ -45,11 +45,15 @@ def get_scaled_for_max_hash(max_hash): cdef bytes to_bytes(s): - if not isinstance(s, (basestring, bytes)): + # Allow for strings, bytes or int + # Single item of byte string = int + if not isinstance(s, (basestring, 
bytes, int)): raise TypeError("Requires a string-like sequence") if isinstance(s, unicode): s = s.encode('utf-8') + if isinstance(s, int): + s = bytes([s]) return s @@ -88,6 +92,7 @@ cdef class MinHash(object): def __init__(self, unsigned int n, unsigned int ksize, bool is_protein=False, + bool dayhoff=False, bool track_abundance=False, uint32_t seed=MINHASH_DEFAULT_SEED, HashIntoType max_hash=0, @@ -107,9 +112,9 @@ cdef class MinHash(object): cdef KmerMinHash *mh = NULL if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash) else: - mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) @@ -122,7 +127,8 @@ cdef class MinHash(object): def __copy__(self): a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, deref(self._this).max_hash) a.merge(self) return a @@ -135,23 +141,24 @@ cdef class MinHash(object): return (deref(self._this).num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, self.get_mins(with_abundance=with_abundance), None, self.track_abundance, deref(self._this).max_hash, deref(self._this).seed) def __setstate__(self, tup): - (n, ksize, is_protein, mins, _, track_abundance, max_hash, seed) =\ + (n, ksize, is_protein, dayhoff, mins, _, track_abundance, max_hash, seed) =\ tup self.track_abundance = track_abundance cdef KmerMinHash *mh = NULL if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) self.set_abundances(mins) else: - mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) self.add_many(mins) @@ -160,6 +167,7 @@ cdef class MinHash(object): (deref(self._this).num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, self.track_abundance, deref(self._this).seed, deref(self._this).max_hash, @@ -173,7 +181,8 @@ cdef class MinHash(object): def copy_and_clear(self): a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, deref(self._this).max_hash) return a @@ -234,6 +243,10 @@ cdef class MinHash(object): def is_protein(self): return deref(self._this).is_protein + @property + def dayhoff(self): + return deref(self._this).dayhoff + @property def ksize(self): return deref(self._this).ksize @@ -247,6 +260,9 @@ cdef class MinHash(object): def add_hash(self, uint64_t h): deref(self._this).add_hash(h) + def translate_codon(self, codon): + return deref(self._this).translate_codon(to_bytes(codon)) + def count_common(self, MinHash other): return deref(self._this).count_common(deref(other._this)) @@ -255,7 +271,8 @@ cdef class MinHash(object): raise ValueError('new sample n is higher than current sample n') a = MinHash(new_num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, 0) if self.track_abundance: 
a.set_abundances(self.get_mins(with_abundance=True)) @@ -286,7 +303,8 @@ cdef class MinHash(object): new_max_hash = get_max_hash_for_scaled(new_num) a = MinHash(0, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, new_max_hash) if self.track_abundance: a.set_abundances(self.get_mins(with_abundance=True)) @@ -307,6 +325,7 @@ cdef class MinHash(object): combined_mh = new KmerMinAbundance(num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, deref(self._this).seed, deref(self._this).max_hash) @@ -314,6 +333,7 @@ cdef class MinHash(object): combined_mh = new KmerMinHash(num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, deref(self._this).seed, deref(self._this).max_hash) @@ -424,12 +444,27 @@ cdef class MinHash(object): if not deref(self._this).is_protein: raise ValueError("cannot add amino acid sequence to DNA MinHash!") - for i in range(0, len(sequence) - ksize + 1): - deref(self._this).add_word(to_bytes(sequence[i:i + ksize])) + aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1)) + if not self.dayhoff: + for aa_kmer in aa_kmers: + deref(self._this).add_word(to_bytes(aa_kmer)) + else: + for aa_kmer in aa_kmers: + dayhoff_kmer = '' + for aa in aa_kmer: + dayhoff_letter = deref(self._this).aa_to_dayhoff(to_bytes(aa)) + dayhoff_kmer += dayhoff_letter + # dayhoff_kmer = ''.join( for aa in aa_kmer) + deref(self._this).add_word(to_bytes(dayhoff_kmer)) def is_molecule_type(self, molecule): if molecule.upper() == 'DNA' and not self.is_protein: return True - if molecule == 'protein' and self.is_protein: - return True + if self.is_protein: + if self.dayhoff: + if molecule == 'dayhoff': + return True + else: + if molecule == 'protein': + return True return False diff --git a/sourmash/command_compute.py b/sourmash/command_compute.py new file mode 100644 index 0000000000..97399bf0ee --- /dev/null +++ b/sourmash/command_compute.py @@ -0,0 +1,618 @@ +""" +Functions implementing the 'compute' command and related functions. +""" +from __future__ import print_function, division, absolute_import + +import argparse +import glob +import os +import os.path +import sys +import random +import itertools +import time +import collections +from collections import defaultdict +from functools import partial +import numpy as np + +import screed +from .sourmash_args import SourmashArgumentParser +from . import DEFAULT_SEED, MinHash +from . import signature as sig +from . import signature_json +from . import sourmash_args +from . import np_utils +from .logging import notify, error, set_quiet, debug + +DEFAULT_COMPUTE_K = '21,31,51' +from .commands import DEFAULT_N +DEFAULT_LINE_COUNT = 1500 + + +def compute(args): + """Compute the signature for one or more files. + + Use cases: + sourmash compute multiseq.fa => multiseq.fa.sig, etc. + sourmash compute genome.fa --singleton => genome.fa.sig + sourmash compute file1.fa file2.fa -o file.sig + => creates one output file file.sig, with one signature for each + input file. + sourmash compute file1.fa file2.fa --merge merged -o file.sig + => creates one output file file.sig, with all sequences from + file1.fa and file2.fa combined into one signature. 
+    """
+    parser = SourmashArgumentParser()
+    parser.add_argument('filenames', nargs='+',
+                        help='file(s) of sequences')
+
+    sourmash_args.add_construct_moltype_args(parser)
+
+    parser.add_argument('-q', '--quiet', action='store_true',
+                        help='suppress non-error output')
+    parser.add_argument('--input-is-protein', action='store_true',
+                        help='Consume protein sequences - no translation needed.')
+    parser.add_argument('-k', '--ksizes',
+                        default=DEFAULT_COMPUTE_K,
+                        help='comma-separated list of k-mer sizes (default: %(default)s)')
+    parser.add_argument('-n', '--num-hashes', type=int,
+                        default=DEFAULT_N,
+                        help='number of hashes to use in each sketch (default: %(default)i)')
+    parser.add_argument('--check-sequence', action='store_true',
+                        help='complain if input sequence is invalid (default: False)')
+    parser.add_argument('-f', '--force', action='store_true',
+                        help='recompute signatures even if the file exists (default: False)')
+    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
+                        help='output computed signatures to this file')
+    parser.add_argument('--singleton', action='store_true',
+                        help='compute a signature for each sequence record individually (default: False)')
+    parser.add_argument('--merge', '--name', type=str, default='', metavar="MERGED",
+                        help="merge all input files into one signature named this")
+    parser.add_argument('--name-from-first', action='store_true',
+                        help="name the signature generated from each file after the first record in the file (default: False)")
+    parser.add_argument('--input-is-10x', action='store_true',
+                        help="Input is 10x single cell output folder (default: False)")
+    parser.add_argument('--count-valid-reads', default=0, type=int,
+                        help="For 10x input only (i.e. input-is-10x flag is True): "
+                        "a barcode is only considered valid, and its signature "
+                        "written, if its number of UMIs is greater than "
+                        "count-valid-reads. This weeds out cell barcodes with "
+                        "very few UMIs, which may be artifacts of spurious "
+                        "enzymatic reactions")
+    parser.add_argument('--write-barcode-meta-csv', type=str,
+                        help="For 10x input only (i.e. input-is-10x flag is True): "
+                        "for each unique barcode, write the number of reads and "
+                        "the number of UMIs to the given path.")
+    parser.add_argument('-p', '--processes', default=2, type=int,
+                        help='For 10x input only (i.e. input-is-10x flag is True): '
+                        'number of processes to use for reading the 10x bam file')
+    parser.add_argument('--save-fastas', type=str,
+                        help='For 10x input only (i.e. input-is-10x flag is True): '
+                        'save merged fastas for all the unique barcodes to {CELL_BARCODE}.fasta, '
+                        'by default in the same directory from which the command is run')
+    parser.add_argument('--line-count', type=int,
+                        help='For 10x input only (i.e. input-is-10x flag is True): line count for each bam shard',
+                        default=DEFAULT_LINE_COUNT)
+    parser.add_argument('--track-abundance', action='store_true',
+                        help='track k-mer abundances in the generated signature (default: False)')
+    parser.add_argument('--scaled', type=float, default=0,
+                        help='choose number of hashes as 1 in FRACTION of input k-mers')
+    parser.add_argument('--seed', type=int,
+                        help='seed used by MurmurHash (default: 42)',
+                        default=DEFAULT_SEED)
+    parser.add_argument('--randomize', action='store_true',
+                        help='shuffle the list of input filenames randomly')
+    parser.add_argument('--license', default='CC0', type=str,
+                        help='signature license. 
Currently only CC0 is supported.') + parser.add_argument('--rename-10x-barcodes', type=str, + help="Tab-separated file mapping 10x barcode name " + "to new name, e.g. with channel or cell " + "annotation label", required=False) + parser.add_argument('--barcodes-file', type=str, + help="Barcodes file if the input is unfiltered 10x bam file", required=False) + + args = parser.parse_args(args) + set_quiet(args.quiet) + + if args.license != 'CC0': + error('error: sourmash only supports CC0-licensed signatures. sorry!') + sys.exit(-1) + + if args.input_is_protein and args.dna: + notify('WARNING: input is protein, turning off nucleotide hashing') + args.dna = False + args.protein = True + + if args.scaled: + if args.scaled < 1: + error('ERROR: --scaled value must be >= 1') + sys.exit(-1) + if args.scaled != round(args.scaled, 0): + error('ERROR: --scaled value must be integer value') + sys.exit(-1) + if args.scaled >= 1e9: + notify('WARNING: scaled value is nonsensical!? Continuing anyway.') + + if args.num_hashes != 0: + notify('setting num_hashes to 0 because --scaled is set') + args.num_hashes = 0 + + notify('computing signatures for files: {}', ", ".join(args.filenames)) + + if args.randomize: + notify('randomizing file list because of --randomize') + random.shuffle(args.filenames) + + # get list of k-mer sizes for which to compute sketches + ksizes = args.ksizes + if ',' in ksizes: + ksizes = ksizes.split(',') + ksizes = list(map(int, ksizes)) + else: + ksizes = [int(ksizes)] + + notify('Computing signature for ksizes: {}', str(ksizes)) + num_sigs = 0 + if args.dna and args.protein: + notify('Computing both nucleotide and protein signatures.') + num_sigs = 2*len(ksizes) + elif args.dna and args.dayhoff: + notify('Computing both nucleotide and Dayhoff-encoded protein ' + 'signatures.') + num_sigs = 2*len(ksizes) + elif args.dna: + notify('Computing only nucleotide (and not protein) signatures.') + num_sigs = len(ksizes) + elif args.protein: + notify('Computing only protein (and not nucleotide) signatures.') + num_sigs = len(ksizes) + elif args.dayhoff: + notify('Computing only Dayhoff-encoded protein (and not nucleotide) ' + 'signatures.') + num_sigs = len(ksizes) + + if (args.protein or args.dayhoff) and not args.input_is_protein: + bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ] + if bad_ksizes: + error('protein ksizes must be divisible by 3, sorry!') + error('bad ksizes: {}', ", ".join(bad_ksizes)) + sys.exit(-1) + + notify('Computing a total of {} signature(s).', num_sigs) + + if num_sigs == 0: + error('...nothing to calculate!? 
Exiting!')
+        sys.exit(-1)
+
+    if args.merge and not args.output:
+        error("must specify -o with --merge")
+        sys.exit(-1)
+
+    def make_minhashes():
+        seed = args.seed
+
+        # one minhash for each ksize
+        Elist = []
+        for k in ksizes:
+            if args.protein:
+                E = MinHash(ksize=k, n=args.num_hashes,
+                            is_protein=True,
+                            dayhoff=False,
+                            track_abundance=args.track_abundance,
+                            scaled=args.scaled,
+                            seed=seed)
+                Elist.append(E)
+            if args.dayhoff:
+                E = MinHash(ksize=k, n=args.num_hashes,
+                            is_protein=True,
+                            dayhoff=True,
+                            track_abundance=args.track_abundance,
+                            scaled=args.scaled,
+                            seed=seed)
+                Elist.append(E)
+            if args.dna:
+                E = MinHash(ksize=k, n=args.num_hashes,
+                            is_protein=False,
+                            dayhoff=False,
+                            track_abundance=args.track_abundance,
+                            scaled=args.scaled,
+                            seed=seed)
+                Elist.append(E)
+        return Elist
+
+    def add_seq(Elist, seq, input_is_protein, check_sequence):
+        for E in Elist:
+            if input_is_protein:
+                E.add_protein(seq)
+            else:
+                E.add_sequence(seq, not check_sequence)
+
+    def build_siglist(Elist, filename, name=None):
+        return [ sig.SourmashSignature(E, filename=filename,
+                                       name=name) for E in Elist ]
+
+    def iter_split(string, sep=None):
+        """Split a string by the given separator and
+        return the results in a generator"""
+        sep = sep or ' '
+        groups = itertools.groupby(string, lambda s: s != sep)
+        return (''.join(g) for k, g in groups if k)
+
+    def fasta_to_sig_record(index):
+        """Convert fasta to sig record"""
+        if umi_filter:
+            return filtered_umi_to_sig(index)
+        else:
+            return unfiltered_umi_to_sig(index)
+
+    def unfiltered_umi_to_sig(index):
+        """Returns signature records across fasta files for a unique barcode"""
+        # Initializing time
+        startt = time.time()
+
+        # Getting all fastas for a given barcode
+        # from different shards
+        single_barcode_fastas = all_fastas_sorted[index]
+
+        count = 0
+        # Iterating through the fasta files for a single barcode
+        # from different shards
+        for fasta in iter_split(single_barcode_fastas, ","):
+
+            # Initializing the fasta file to write
+            # all the sequences from all bam shards to
+            if count == 0:
+                unique_fasta_file = os.path.basename(fasta)
+                if args.save_fastas:
+                    f = open(unique_fasta_file, "w")
+
+            # Add sequence
+            for record in screed.open(fasta):
+                sequence = record.sequence
+                add_seq(Elist, sequence,
+                        args.input_is_protein, args.check_sequence)
+
+                # Updating the fasta file with each of the sequences
+                if args.save_fastas:
+                    f.write(">{}\n{}\n".format(record.name, record.sequence))
+
+            # Delete fasta file in tmp folder
+            if os.path.exists(fasta):
+                os.unlink(fasta)
+
+            count += 1
+
+        # Close the opened fasta file
+        if args.save_fastas:
+            f.close()
+
+        # Build signature records
+        barcode_name = unique_fasta_file.replace(".fasta", "")
+        siglist = build_siglist(
+            Elist,
+            os.path.join(args.filenames[0], unique_fasta_file),
+            name=barcode_name)
+        records = signature_json.add_meta_save(siglist)
+
+        notify(
+            "time taken to build signature records for a barcode {} is {:.5f} seconds",
+            unique_fasta_file, time.time() - startt, end='\r')
+        return records
+
+    def filtered_umi_to_sig(index):
+        """Returns signature records for all the fasta files for a unique
+        barcode, only if it has more than count-valid-reads number of umis."""
+
+        # Initializing time
+        startt = time.time()
+
+        # Getting all fastas for a given barcode
+        # from different shards
+        single_barcode_fastas = all_fastas_sorted[index]
+
+        debug("calculating umi counts", end="\r", flush=True)
+        # Tracking UMI Counts
+        umis = defaultdict(int)
+        # Iterating through the fasta files for a single barcode
+        # from different shards
+        for fasta in iter_split(single_barcode_fastas, ","):
+            # calculate unique umi, sequence counts
+            for record in screed.open(fasta):
+                umis[record.name] += record.sequence.count(delimiter)
+
+        if args.write_barcode_meta_csv:
+            unique_fasta_file = os.path.basename(fasta)
+            unique_fasta_file = unique_fasta_file.replace(".fasta", "_meta.txt")
+            with open(unique_fasta_file, "w") as f:
+                f.write("{} {}".format(len(umis), sum(list(umis.values()))))
+
+        debug("Completed tracking umi counts", end="\r", flush=True)
+        if len(umis) < args.count_valid_reads:
+            return []
+        count = 0
+        for fasta in iter_split(single_barcode_fastas, ","):
+
+            # Initializing fasta file to save the sequence to
+            if count == 0:
+                unique_fasta_file = os.path.basename(fasta)
+                if args.save_fastas:
+                    f = open(unique_fasta_file, "w")
+
+            # Add sequences of barcodes with more than count-valid-reads umis
+            for record in screed.open(fasta):
+                sequence = record.sequence
+                add_seq(Elist, sequence,
+                        args.input_is_protein, args.check_sequence)
+
+                # Updating the fasta file with each of the sequences
+                if args.save_fastas:
+                    f.write(">{}\n{}\n".format(record.name, record.sequence))
+
+            # Delete fasta file in tmp folder
+            if os.path.exists(fasta):
+                os.unlink(fasta)
+            count += 1
+
+        debug("Added sequences of unique barcode,umi to Elist", end="\r",
+              flush=True)
+        # Close the opened fasta file
+        if args.save_fastas:
+            f.close()
+        # Build signature records
+        barcode_name = unique_fasta_file.replace(".fasta", "")
+        siglist = build_siglist(
+            Elist,
+            os.path.join(args.filenames[0], unique_fasta_file),
+            name=barcode_name)
+        records = signature_json.add_meta_save(siglist)
+        notify(
+            "time taken to build signature records for a barcode {} is {:.5f} seconds",
+            unique_fasta_file, time.time() - startt, end="\r", flush=True)
+        return records
+
+    def save_siglist(siglist, output_fp, filename=None):
+        # save!
+        if output_fp:
+            sigfile_name = args.output.name
+            sig.save_signatures(siglist, args.output)
+        else:
+            if filename is None:
+                raise Exception("internal error, filename is None")
+            with open(filename, 'w') as fp:
+                sigfile_name = filename
+                sig.save_signatures(siglist, fp)
+        notify(
+            'saved signature(s) to {}. 
Note: signature license is CC0.',
+            sigfile_name,
+            end="\r")
+
+    def calculate_chunksize(length, num_jobs):
+        chunksize, extra = divmod(length, num_jobs)
+        if extra:
+            chunksize += 1
+        return chunksize
+
+    if args.input_is_10x:
+        all_fastas_sorted = []
+        delimiter = "X"
+        umi_filter = True if args.count_valid_reads != 0 else False
+        Elist = make_minhashes()
+        CELL_BARCODE = "CELL_BARCODE"
+        UMI_COUNT = "UMI_COUNT"
+        READ_COUNT = "READ_COUNT"
+
+    if args.track_abundance:
+        notify('Tracking abundance of input k-mers.')
+
+    if not args.merge:
+        if args.output:
+            siglist = []
+
+        for filename in args.filenames:
+            sigfile = os.path.basename(filename) + '.sig'
+            if not args.output and os.path.exists(sigfile) and not \
+                    args.force:
+                notify('skipping {} - already done', filename)
+                continue
+
+            if args.singleton:
+                siglist = []
+                for n, record in enumerate(screed.open(filename)):
+                    # make minhashes for each sequence
+                    Elist = make_minhashes()
+                    add_seq(Elist, record.sequence,
+                            args.input_is_protein, args.check_sequence)
+
+                    siglist += build_siglist(Elist, filename, name=record.name)
+
+                notify('calculated {} signatures for {} sequences in {}',
+                       len(siglist), n + 1, filename)
+            elif args.input_is_10x:
+                from .tenx import (read_barcodes_file, shard_bam_file,
+                                   bam_to_fasta)
+                from pathos import multiprocessing
+
+                # Initializing time
+                startt = time.time()
+
+                # Setting barcodes file; some 10x files don't have a filtered
+                # barcode file
+                if args.barcodes_file is not None:
+                    barcodes = read_barcodes_file(args.barcodes_file)
+                else:
+                    barcodes = None
+
+                # Shard bam file into smaller bam files
+                notify('... reading bam file from {}', filename)
+                n_jobs = args.processes
+                filenames, mmap_file = np_utils.to_memmap(np.array(
+                    shard_bam_file(filename, args.line_count, os.getcwd())))
+
+                # Create a per-cell fasta generator of sequences.
+                # If the reads should be filtered by barcodes and umis,
+                # the umis are saved in the fasta file as the record name and
+                # the name of the fasta file is the barcode
+                func = partial(
+                    bam_to_fasta,
+                    barcodes,
+                    args.rename_10x_barcodes,
+                    delimiter,
+                    umi_filter)
+
+                length_sharded_bam_files = len(filenames)
+                chunksize = calculate_chunksize(length_sharded_bam_files,
+                                                n_jobs)
+                pool = multiprocessing.Pool(processes=n_jobs)
+                notify(
+                    "multiprocessing pool processes {} and chunksize {} calculated", n_jobs, chunksize)
+                # All the fastas are stored in a single string instead of a
+                # list, which avoids the per-element pointer overhead (8 bytes
+                # each on 64-bit systems). With unique barcodes on the order
+                # of 10^6 before filtering, saving each barcode as a separate
+                # element would produce a huge list, hence the string
+                all_fastas = ",".join(itertools.chain(*(
+                    pool.imap(
+                        lambda x: func(x.encode('utf-8')), filenames, chunksize=chunksize))))
+                pool.close()
+                pool.join()
+
+                # clean up the memmap and sharded intermediary bam files
+                [os.unlink(file) for file in filenames if os.path.exists(file)]
+                del filenames
+                os.unlink(mmap_file)
+                notify("Deleted intermediary bam and memmap files")
+
+                # Build a dictionary with each unique barcode as key and
+                # their fasta files from different shards
+                fasta_files_dict = collections.OrderedDict()
+                for fasta in iter_split(all_fastas, ","):
+                    barcode = os.path.basename(fasta).replace(".fasta", "")
+                    value = fasta_files_dict.get(barcode, "")
+                    fasta_files_dict[barcode] = value + fasta + ","
+
+                # Cleaning up to retrieve memory from unused large variables
+                del all_fastas
+                notify("Created fasta_files_dict")
+
+                # Find unique barcodes
+                all_fastas_sorted = list(fasta_files_dict.values())
+                del fasta_files_dict
+                unique_barcodes = len(all_fastas_sorted)
+                notify("Found {} unique barcodes", unique_barcodes)
+
+                # For each barcode, add all of its sequences, compute the
+                # minhashes, and convert them to signature records. The
+                # fasta_to_sig_record call below also respects umi_filter,
+                # creating records only for barcodes with at least
+                # count-valid-reads UMIs. This keeps just the barcodes with
+                # valid reads and a meaningful number of UMIs; otherwise
+                # every barcode would be saved, even those with a single
+                # umi/read, effectively passing through every alignment in
+                # the bam file and producing a signature file measured in GB
+                # when the expected .sig file should be on the order of MB
+
+                pool = multiprocessing.Pool(processes=n_jobs)
+                chunksize = calculate_chunksize(unique_barcodes, n_jobs)
+                notify("Pooled {} and chunksize {} mapped", n_jobs, chunksize)
+
+                records = list(itertools.chain(*pool.imap(
+                    lambda index: fasta_to_sig_record(index),
+                    range(unique_barcodes),
+                    chunksize=chunksize)))
+
+                pool.close()
+                pool.join()
+
+                if args.write_barcode_meta_csv:
+
+                    barcodes_meta_txts = glob.glob("*_meta.txt")
+
+                    with open(args.write_barcode_meta_csv, "w") as fp:
+                        fp.write("{},{},{}".format(CELL_BARCODE, UMI_COUNT,
+                                                   READ_COUNT))
+                        fp.write('\n')
+                        for barcode_meta_txt in barcodes_meta_txts:
+                            with open(barcode_meta_txt, 'r') as f:
+                                umi_count, read_count = f.readline().split()
+                                umi_count = int(umi_count)
+                                read_count = int(read_count)
+
+                                barcode_name = barcode_meta_txt.replace('_meta.txt', '')
+                                fp.write("{},{},{}\n".format(barcode_name,
+                                                             umi_count,
+                                                             read_count))
+
+                            os.unlink(barcode_meta_txt)
+
+                filtered_barcode_signatures = len(records)
+                notify("Signature records created for {} barcodes",
+                       filtered_barcode_signatures)
+
+                # Save the signature records in a .sig file
+                if args.output is not None:
+                    signature_json.write_records_to_json(records, args.output)
+                else:
+                    signature_json.write_records_to_json(records, open(sigfile, "wt"))
+                del records
+
+                notify("time taken to save {} signature records for 10x folder is {:.5f} seconds",
+                       filtered_barcode_signatures, time.time() - startt)
+            else:
+                # make minhashes for the whole file
+                Elist = make_minhashes()
+
+                # consume & calculate signatures
+                notify('... reading sequences from {}', filename)
+                name = None
+                for n, record in enumerate(screed.open(filename)):
+                    if n % 10000 == 0:
+                        if n:
+                            notify('\r...{} {}', filename, n, end='')
+                        elif args.name_from_first:
+                            name = record.name
+
+                    add_seq(Elist, record.sequence,
+                            args.input_is_protein, args.check_sequence)
+
+                notify('...{} {} sequences', filename, n, end='')
+
+                sigs = build_siglist(Elist, filename, name)
+                if args.output:
+                    siglist += sigs
+                else:
+                    siglist = sigs
+
+                notify('calculated {} signatures for {} sequences in {}',
+                       len(sigs), n + 1, filename)
+
+        if not args.output and not args.input_is_10x:
+            save_siglist(siglist, args.output, sigfile)
+
+        if args.output and not args.input_is_10x:
+            save_siglist(siglist, args.output, sigfile)
+    else:                             # single name specified - combine all
+        # make minhashes for the whole file
+        Elist = make_minhashes()
+
+        total_seq = 0
+        for filename in args.filenames:
+            # consume & calculate signatures
+            notify('... reading sequences from {}', filename)
+
+            for n, record in enumerate(screed.open(filename)):
+                if n % 10000 == 0 and n:
+                    notify('\r... 
{} {}', filename, n, end='') + + add_seq(Elist, record.sequence, + args.input_is_protein, args.check_sequence) + notify('... {} {} sequences', filename, n + 1) + + total_seq += n + 1 + + siglist = build_siglist(Elist, filename, name=args.merge) + notify('calculated {} signatures for {} sequences taken from {} files', + len(siglist), total_seq, len(args.filenames)) + + # at end, save! + save_siglist(siglist, args.output) diff --git a/sourmash/commands.py b/sourmash/commands.py index 9c4db3f408..d513d533a3 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -1,28 +1,30 @@ -from __future__ import print_function, division +""" +Functions implementing the main command-line subcommands. +""" +from __future__ import print_function, division, absolute_import import argparse import csv -import itertools -import multiprocessing import os import os.path import sys -import random import screed +from .compare import compare_all_pairs from .sourmash_args import SourmashArgumentParser -from . import DEFAULT_SEED, MinHash, load_sbt_index, create_sbt_index +from . import MinHash, load_sbt_index, create_sbt_index from . import signature as sig from . import sourmash_args from .logging import notify, error, print_results, set_quiet from .sbtmh import SearchMinHashesFindBest, SigLeaf from .sourmash_args import DEFAULT_LOAD_K -DEFAULT_COMPUTE_K = '21,31,51' DEFAULT_N = 500 WATERMARK_SIZE = 10000 +from .command_compute import compute + def info(args): "Report sourmash version + version of installed dependencies." @@ -46,310 +48,6 @@ def info(args): notify('screed version {}', screed.__version__) notify('- loaded from path: {}', os.path.dirname(screed.__file__)) -def compute(args): - """Compute the signature for one or more files. - - Use cases: - sourmash compute multiseq.fa => multiseq.fa.sig, etc. - sourmash compute genome.fa --singleton => genome.fa.sig - sourmash compute file1.fa file2.fa -o file.sig - => creates one output file file.sig, with one signature for each - input file. - sourmash compute file1.fa file2.fa --merge merged -o file.sig - => creates one output file file.sig, with all sequences from - file1.fa and file2.fa combined into one signature. 
- """ - parser = SourmashArgumentParser() - parser.add_argument('filenames', nargs='+', - help='file(s) of sequences') - - sourmash_args.add_construct_moltype_args(parser) - - parser.add_argument('-q', '--quiet', action='store_true', - help='suppress non-error output') - parser.add_argument('--input-is-protein', action='store_true', - help='Consume protein sequences - no translation needed.') - parser.add_argument('-k', '--ksizes', - default=DEFAULT_COMPUTE_K, - help='comma-separated list of k-mer sizes (default: %(default)s)') - parser.add_argument('-n', '--num-hashes', type=int, - default=DEFAULT_N, - help='number of hashes to use in each sketch (default: %(default)i)') - parser.add_argument('--check-sequence', action='store_true', - help='complain if input sequence is invalid (default: False)') - parser.add_argument('-f', '--force', action='store_true', - help='recompute signatures even if the file exists (default: False)') - parser.add_argument('-o', '--output', type=argparse.FileType('wt'), - help='output computed signatures to this file') - parser.add_argument('--singleton', action='store_true', - help='compute a signature for each sequence record individually (default: False)') - parser.add_argument('--merge', '--name', type=str, default='', metavar="MERGED", - help="merge all input files into one signature named this") - parser.add_argument('--name-from-first', action='store_true', - help="name the signature generated from each file after the first record in the file (default: False)") - parser.add_argument('--input-is-10x', action='store_true', - help="Input is 10x single cell output folder (default: False)") - parser.add_argument('-p', '--processes', default=2, type=int, - help='Number of processes to use for reading 10x bam file') - parser.add_argument('--track-abundance', action='store_true', - help='track k-mer abundances in the generated signature (default: False)') - parser.add_argument('--scaled', type=float, default=0, - help='choose number of hashes as 1 in FRACTION of input k-mers') - parser.add_argument('--seed', type=int, - help='seed used by MurmurHash (default: 42)', - default=DEFAULT_SEED) - parser.add_argument('--randomize', action='store_true', - help='shuffle the list of input filenames randomly') - parser.add_argument('--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.') - - args = parser.parse_args(args) - set_quiet(args.quiet) - - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') - sys.exit(-1) - - if args.input_is_protein and args.dna: - notify('WARNING: input is protein, turning off nucleotide hashing') - args.dna = False - args.protein = True - - if args.scaled: - if args.scaled < 1: - error('ERROR: --scaled value must be >= 1') - sys.exit(-1) - if args.scaled != round(args.scaled, 0): - error('ERROR: --scaled value must be integer value') - sys.exit(-1) - if args.scaled >= 1e9: - notify('WARNING: scaled value is nonsensical!? 
Continuing anyway.') - - if args.num_hashes != 0: - notify('setting num_hashes to 0 because --scaled is set') - args.num_hashes = 0 - - notify('computing signatures for files: {}', ", ".join(args.filenames)) - - if args.randomize: - notify('randomizing file list because of --randomize') - random.shuffle(args.filenames) - - # get list of k-mer sizes for which to compute sketches - ksizes = args.ksizes - if ',' in ksizes: - ksizes = ksizes.split(',') - ksizes = list(map(int, ksizes)) - else: - ksizes = [int(ksizes)] - - notify('Computing signature for ksizes: {}', str(ksizes)) - - num_sigs = 0 - if args.dna and args.protein: - notify('Computing both nucleotide and protein signatures.') - num_sigs = 2*len(ksizes) - elif args.dna: - notify('Computing only nucleotide (and not protein) signatures.') - num_sigs = len(ksizes) - elif args.protein: - notify('Computing only protein (and not nucleotide) signatures.') - num_sigs = len(ksizes) - - if args.protein and not args.input_is_protein: - bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ] - if bad_ksizes: - error('protein ksizes must be divisible by 3, sorry!') - error('bad ksizes: {}', ", ".join(bad_ksizes)) - sys.exit(-1) - - notify('Computing a total of {} signature(s).', num_sigs) - - if num_sigs == 0: - error('...nothing to calculate!? Exiting!') - sys.exit(-1) - - if args.merge and not args.output: - error("must specify -o with --merge") - sys.exit(-1) - - def make_minhashes(): - seed = args.seed - - # one minhash for each ksize - Elist = [] - for k in ksizes: - if args.protein: - E = MinHash(ksize=k, n=args.num_hashes, - is_protein=True, - track_abundance=args.track_abundance, - scaled=args.scaled, - seed=seed) - Elist.append(E) - if args.dna: - E = MinHash(ksize=k, n=args.num_hashes, - is_protein=False, - track_abundance=args.track_abundance, - scaled=args.scaled, - seed=seed) - Elist.append(E) - return Elist - - def add_seq(Elist, seq, input_is_protein, check_sequence): - for E in Elist: - if input_is_protein: - E.add_protein(seq) - else: - E.add_sequence(seq, not check_sequence) - - def build_siglist(Elist, filename, name=None): - return [ sig.SourmashSignature(E, filename=filename, - name=name) for E in Elist ] - - def save_siglist(siglist, output_fp, filename=None): - # save! - if output_fp: - sig.save_signatures(siglist, args.output) - else: - if filename is None: - raise Exception("internal error, filename is None") - with open(filename, 'w') as fp: - sig.save_signatures(siglist, fp) - notify('saved {} signature(s). 
Note: signature license is CC0.'.format(len(siglist))) - - def maybe_add_barcode(barcode, cell_seqs): - if barcode not in cell_seqs: - cell_seqs[barcode] = make_minhashes() - - def maybe_add_alignment(alignment, cell_seqs, args, barcodes): - high_quality_mapping = alignment.mapq == 255 - good_barcode = 'CB' in alignment.tags and \ - alignment.get_tag('CB') in barcodes - good_umi = 'UB' in alignment.tags - - pass_qc = high_quality_mapping and good_barcode and \ - good_umi - if pass_qc: - barcode = alignment.get_tag('CB') - # if this isn't marked a duplicate, count it as a UMI - if not alignment.is_duplicate: - maybe_add_barcode(barcode, cell_seqs) - add_seq(cell_seqs[barcode], alignment.seq, - args.input_is_protein, args.check_sequence) - - if args.track_abundance: - notify('Tracking abundance of input k-mers.') - - if not args.merge: - if args.output: - siglist = [] - - for filename in args.filenames: - sigfile = os.path.basename(filename) + '.sig' - if not args.output and os.path.exists(sigfile) and not \ - args.force: - notify('skipping {} - already done', filename) - continue - - if args.singleton: - siglist = [] - for n, record in enumerate(screed.open(filename)): - # make minhashes for each sequence - Elist = make_minhashes() - add_seq(Elist, record.sequence, - args.input_is_protein, args.check_sequence) - - siglist += build_siglist(Elist, filename, name=record.name) - - notify('calculated {} signatures for {} sequences in {}', - len(siglist), n + 1, filename) - elif args.input_is_10x: - import pathos.multiprocessing as mp - from .tenx import read_10x_folder - - barcodes, bam_file = read_10x_folder(filename) - manager = multiprocessing.Manager() - - cell_seqs = manager.dict() - - notify('... reading sequences from {}', filename) - - pool = mp.Pool(processes=args.processes) - pool.map(lambda x: maybe_add_alignment(x, cell_seqs, args, barcodes), bam_file) - # for n, alignment in enumerate(bam_file): - # if n % 10000 == 0: - # if n: - # notify('\r...{} {}', filename, n, end='') - # maybe_add_alignment(alignment, cell_seqs) - - cell_signatures = [ - build_siglist(seqs, filename=filename, name=barcode) - for barcode, seqs in cell_seqs.items()] - sigs = list(itertools.chain(*cell_signatures)) - if args.output: - siglist += sigs - else: - siglist = sigs - - else: - # make minhashes for the whole file - Elist = make_minhashes() - - # consume & calculate signatures - notify('... reading sequences from {}', filename) - name = None - for n, record in enumerate(screed.open(filename)): - if n % 10000 == 0: - if n: - notify('\r...{} {}', filename, n, end='') - elif args.name_from_first: - name = record.name - - add_seq(Elist, record.sequence, - args.input_is_protein, args.check_sequence) - - notify('...{} {} sequences', filename, n, end='') - - sigs = build_siglist(Elist, filename, name) - if args.output: - siglist += sigs - else: - siglist = sigs - - notify('calculated {} signatures for {} sequences in {}', - len(sigs), n + 1, filename) - - if not args.output: - save_siglist(siglist, args.output, sigfile) - - if args.output: - save_siglist(siglist, args.output, sigfile) - else: # single name specified - combine all - # make minhashes for the whole file - Elist = make_minhashes() - - total_seq = 0 - for filename in args.filenames: - # consume & calculate signatures - notify('... reading sequences from {}', filename) - - for n, record in enumerate(screed.open(filename)): - if n % 10000 == 0 and n: - notify('\r... 
{} {}', filename, n, end='') - - add_seq(Elist, record.sequence, - args.input_is_protein, args.check_sequence) - notify('... {} {} sequences', filename, n + 1) - - total_seq += n + 1 - - siglist = build_siglist(Elist, filename, name=args.merge) - notify('calculated {} signatures for {} sequences taken from {} files', - len(siglist), total_seq, len(args.filenames)) - - # at end, save! - save_siglist(siglist, args.output) - def compare(args): "Compare multiple signature files and create a distance matrix." @@ -366,6 +64,8 @@ def compare(args): help='compare all signatures underneath directories.') parser.add_argument('--csv', type=argparse.FileType('w'), help='save matrix in CSV format (with column headers)') + parser.add_argument('-p', '--processes', type=int, + help='Number of processes to use to calculate similarity') parser.add_argument('-q', '--quiet', action='store_true', help='suppress non-error output') args = parser.parse_args(args) @@ -440,31 +140,22 @@ def compare(args): notify('') # build the distance matrix - D = numpy.zeros([len(siglist), len(siglist)]) numpy.set_printoptions(precision=3, suppress=True) # do all-by-all calculation - labeltext = [] - for i, E in enumerate(siglist): - for j, E2 in enumerate(siglist): - if i < j: - continue - similarity = E.similarity(E2, args.ignore_abundance) - D[i][j] = similarity - D[j][i] = similarity - - labeltext.append(E.name()) + labeltext = [item.name() for item in siglist] + similarity = compare_all_pairs(siglist, args.ignore_abundance, + n_jobs=args.processes) if len(siglist) < 30: for i, E in enumerate(siglist): # for small matrices, pretty-print some output name_num = '{}-{}'.format(i, E.name()) if len(name_num) > 20: name_num = name_num[:17] + '...' - print_results('{:20s}\t{}'.format(name_num, D[i, :, ],)) - - print_results('min similarity in matrix: {:.3f}', numpy.min(D)) + print_results('{:20s}\t{}'.format(name_num, similarity[i, :, ],)) + print_results('min similarity in matrix: {:.3f}', numpy.min(similarity)) # shall we output a matrix? if args.output: labeloutname = args.output + '.labels.txt' @@ -474,7 +165,7 @@ def compare(args): notify('saving distance matrix to: {}', args.output) with open(args.output, 'wb') as fp: - numpy.save(fp, D) + numpy.save(fp, similarity) # output CSV? if args.csv: @@ -484,7 +175,7 @@ def compare(args): for i in range(len(labeltext)): y = [] for j in range(len(labeltext)): - y.append('{}'.format(D[i][j])) + y.append('{}'.format(similarity[i][j])) args.csv.write(','.join(y) + '\n') @@ -516,6 +207,7 @@ def plot(args): help="random seed for --subsample; default=1") parser.add_argument('-f', '--force', action='store_true', help='forcibly plot non-distance matrices') + parser.add_argument('--output-dir', help='directory for output plots') args = parser.parse_args(args) @@ -552,6 +244,14 @@ def plot(args): else: hist_out += '.png' + # output to a different directory? 
+ if args.output_dir: + if not os.path.isdir(args.output_dir): + os.mkdir(args.output_dir) + dendrogram_out = os.path.join(args.output_dir, dendrogram_out) + matrix_out = os.path.join(args.output_dir, matrix_out) + hist_out = os.path.join(args.output_dir, hist_out) + # make the histogram notify('saving histogram of matrix values => {}', hist_out) fig = pylab.figure(figsize=(8,5)) @@ -575,10 +275,9 @@ def plot(args): np_idx = numpy.array(sample_idx) D = D[numpy.ix_(np_idx, np_idx)] labeltext = [ labeltext[idx] for idx in sample_idx ] - ### do clustering Y = sch.linkage(D, method='single') - Z1 = sch.dendrogram(Y, orientation='right', labels=labeltext) + sch.dendrogram(Y, orientation='right', labels=labeltext) fig.savefig(dendrogram_out) notify('wrote dendrogram to: {}', dendrogram_out) @@ -636,7 +335,8 @@ def import_csv(args): def dump(args): parser = SourmashArgumentParser() parser.add_argument('filenames', nargs='+') - parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_LOAD_K, help='k-mer size (default: %(default)i)') + parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_LOAD_K, + help='k-mer size (default: %(default)i)') args = parser.parse_args(args) for filename in args.filenames: @@ -662,7 +362,6 @@ def sbt_combine(args): sourmash_args.add_moltype_args(parser) args = parser.parse_args(args) - moltype = sourmash_args.calculate_moltype(args) inp_files = list(args.sbts) notify('combining {} SBTs', len(inp_files)) @@ -693,7 +392,7 @@ def index(args): parser.add_argument('-d', '--n_children', type=int, default=2, help='Number of children for internal nodes') parser.add_argument('--traverse-directory', action='store_true', - help='load all signatures underneath this directory.') + help='load all signatures underneath any directories.') parser.add_argument('--append', action='store_true', default=False, help='add signatures to an existing SBT.') parser.add_argument('-x', '--bf-size', type=float, default=1e5, @@ -1110,8 +809,15 @@ def gather(args): outname = args.output_unassigned.name notify('saving unassigned hashes to "{}"', outname) - e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash) - e.add_many(next_query.minhash.get_mins()) + with_abundance = next_query.minhash.track_abundance + e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash, + track_abundance=with_abundance) + if with_abundance: + abunds = next_query.minhash.get_mins(with_abundance=True) + e.set_abundances(abunds) + else: + e.add_many(next_query.minhash.get_mins()) + sig.save_signatures([ sig.SourmashSignature(e) ], args.output_unassigned) @@ -1310,9 +1016,15 @@ def watch(args): if args.dna: moltype = 'DNA' is_protein = False - else: + dayhoff = False + elif args.protein: moltype = 'protein' is_protein = True + dayhoff = False + else: + moltype = 'dayhoff' + is_protein = True + dayhoff = True tree = load_sbt_index(args.sbt_name) @@ -1323,7 +1035,7 @@ def watch(args): tree_mh = leaf.data.minhash ksize = tree_mh.ksize - E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein) + E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff) streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) notify('Computing signature for k={}, {} from stdin', ksize, moltype) diff --git a/sourmash/compare.py b/sourmash/compare.py new file mode 100644 index 0000000000..7b30a3a7fe --- /dev/null +++ b/sourmash/compare.py @@ -0,0 +1,208 @@ +"""Functionality for comparing many signatures, used in sourmash compare.""" + +import itertools +from 
functools import partial
+import time
+import multiprocessing
+import numpy as np
+
+from .logging import notify
+from sourmash.np_utils import to_memmap
+
+
+def compare_serial(siglist, ignore_abundance, downsample=False):
+    """Compare all combinations of signatures and return a matrix
+    of similarities. Processes combinations serially on a single
+    process. Best to use when there are few signatures.
+
+    :param list siglist: list of signatures to compare
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :return: np.array similarity matrix
+    """
+
+    n = len(siglist)
+
+    # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
+    iterator = itertools.combinations(range(n), 2)
+
+    similarities = np.ones((n, n))
+
+    for i, j in iterator:
+        similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance, downsample)
+
+    return similarities
+
+
+def similarity(sig1, sig2, ignore_abundance, downsample):
+    """Compute similarity with the other MinHash signature.
+    This function is separated from the SourmashSignature class to
+    avoid pickling the whole class during the pool map
+
+    :param sig1 first signature
+    :param sig2 other signature to compare with
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :return: float similarity of the two signatures
+    """
+
+    try:
+        sig = sig1.minhash.similarity(sig2.minhash, ignore_abundance)
+        return sig
+    except ValueError as e:
+        if 'mismatch in max_hash' in str(e) and downsample:
+            xx = sig1.minhash.downsample_max_hash(sig2.minhash)
+            yy = sig2.minhash.downsample_max_hash(sig1.minhash)
+            sig = xx.similarity(yy, ignore_abundance)
+            return sig
+        else:
+            raise
+
+
+def similarity_args_unpack(args, ignore_abundance, downsample):
+    """Helper function to unpack the arguments. Written to use in pool.imap as it
+    can only be given one argument."""
+    return similarity(*args, ignore_abundance=ignore_abundance, downsample=downsample)
+
+
+def get_similarities_at_index(index, ignore_abundance, downsample, siglist):
+    """Returns similarities of all the combinations of the signature at index in
+    the siglist with the rest of the signatures starting at index + 1, avoiding
+    redundant calculation of combinations with indices at or before index
+
+    :param int index: index of the signature to compare with the rest of the list
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :param siglist list of signatures
+    :return: list of similarities for the combinations of the signature at index
+        with the rest of the signatures from index + 1
+    """
+    startt = time.time()
+    sig_iterator = itertools.product([siglist[index]], siglist[index + 1:])
+    func = partial(similarity_args_unpack,
+                   ignore_abundance=ignore_abundance,
+                   downsample=downsample)
+    similarity_list = list(map(func, sig_iterator))
+    notify(
+        "comparison for index {} done in {:.5f} seconds",
+        index,
+        time.time() - startt,
+        end='\r')
+    return similarity_list
+
+
+def compare_parallel(siglist, ignore_abundance, downsample, n_jobs):
+    """Compare all combinations of signatures and return a matrix
+    of similarities. Processes combinations in parallel on the number
+    of processes given by n_jobs
+
+    :param list siglist: list of signatures to compare
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :param int n_jobs number of processes to run the similarity calculations on
+    :return: np.array similarity matrix
+    """
+    # Starting time - calculate time to keep track in case of a lengthy siglist
+    start_initial = time.time()
+
+    # Create a memory map of the siglist using numpy to avoid memory burden
+    # while accessing small parts in it
+    siglist, _ = to_memmap(np.array(siglist))
+    notify("Created memmapped siglist")
+
+    # The length of the siglist determines the dimension of the square
+    # similarity matrix
+    length_siglist = len(siglist)
+
+    # Initialize with ones on the diagonal as the similarity of a signature with
+    # itself is one
+    similarities = np.eye(length_siglist, dtype=np.float64)
+    memmap_similarities, filename = to_memmap(similarities)
+    notify("Initialized memmapped similarities matrix")
+
+    # Initialize the function using functools.partial with the common arguments
+    # (siglist, ignore_abundance, downsample) for computing all the signatures.
+    # The only changing parameter that will be mapped from the pool is the index
+    func = partial(
+        get_similarities_at_index,
+        siglist=siglist,
+        ignore_abundance=ignore_abundance,
+        downsample=downsample)
+    notify("Created similarity func")
+
+    # Initialize multiprocessing.Pool
+    pool = multiprocessing.Pool(processes=n_jobs)
+
+    # Calculate chunk size; by default the pool.imap chunk size is 1
+    chunksize, extra = divmod(length_siglist, n_jobs)
+    if extra:
+        chunksize += 1
+    notify("Calculated chunk size for multiprocessing")
+
+    # This will not generate the results yet, since pool.imap returns a generator
+    result = pool.imap(func, range(length_siglist), chunksize=chunksize)
+    notify("Initialized multiprocessing pool.imap")
+
+    # Enumerate and calculate similarities at each of the indices
+    # and set the results at the appropriate combination coordinate
+    # locations inside the similarity matrix
+    for index, l in enumerate(result):
+        startt = time.time()
+        col_idx = index + 1
+        for idx_condensed, item in enumerate(l):
+            memmap_similarities[index, col_idx + idx_condensed] = memmap_similarities[idx_condensed + col_idx, index] = item
+        notify(
+            "Setting similarities matrix for index {} done in {:.5f} seconds",
+            index,
+            time.time() - startt,
+            end='\r')
+    notify("Setting similarities completed")
+
+    pool.close()
+    pool.join()
+
+    notify("Time taken to compare all pairs in parallel is {:.5f} seconds ", 
time.time() - start_initial)
+    return np.memmap(filename, dtype=np.float64, shape=(length_siglist, length_siglist))
+
+
+def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None):
+    """Compare all combinations of signatures and return a matrix
+    of similarities. Processes combinations either serially or in
+    parallel, depending on the number of processes given by n_jobs
+
+    :param list siglist: list of signatures to compare
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :param int n_jobs number of processes to run the similarity calculations on;
+        if n_jobs is None or 1, compare serially, otherwise in parallel.
+    :return: np.array similarity matrix
+    """
+    if n_jobs is None or n_jobs == 1:
+        similarities = compare_serial(siglist, ignore_abundance, downsample)
+    else:
+        similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs)
+    return similarities
diff --git a/sourmash/kmer_min_hash.hh b/sourmash/kmer_min_hash.hh
index 068e3fde6f..a85b5c0c20 100644
--- a/sourmash/kmer_min_hash.hh
+++ b/sourmash/kmer_min_hash.hh
@@ -2,6 +2,7 @@
 #define KMER_MIN_HASH_HH
 
 #include
+#include
 #include
 #include
 #include
@@ -59,13 +60,14 @@ public:
     const unsigned int num;
     const unsigned int ksize;
     const bool is_protein;
+    const bool dayhoff;
     const uint32_t seed;
     const HashIntoType max_hash;
     CMinHashType mins;
 
-    KmerMinHash(unsigned int n, unsigned int k, bool prot, uint32_t s,
+    KmerMinHash(unsigned int n, unsigned int k, bool prot, bool dyhoff, uint32_t s,
                 HashIntoType mx)
-        : num(n), ksize(k), is_protein(prot), seed(s), max_hash(mx) {
+        : num(n), ksize(k), is_protein(prot), dayhoff(dyhoff), seed(s), max_hash(mx) {
         if (n > 0) {
             mins.reserve(num + 1);
         }
@@ -82,6 +84,9 @@
         if (is_protein != other.is_protein) {
             throw minhash_exception("DNA/prot minhashes cannot be compared");
         }
+        if (dayhoff != other.dayhoff) {
+            throw minhash_exception("DNA/prot minhashes cannot be compared");
+        }
         if (max_hash != other.max_hash) {
             throw minhash_exception("mismatch in max_hash; comparison fail");
         }
@@ -177,19 +182,54 @@ public:
         }
     }
 
-    std::string _dna_to_aa(const std::string& dna) {
-        std::string aa;
-        unsigned int dna_size = (dna.size() / 3) * 3; // floor it
-        for (unsigned int j = 0; j < dna_size; j += 3) {
-            std::string codon = dna.substr(j, 3);
+    std::string translate_codon(std::string& codon) {
+        std::string residue;
+
+        if (codon.length() >= 2 && codon.length() <= 3){
+            // If codon is length 2, pad with an N for ambiguous codon amino acids
+            if (codon.length() == 2) {
+                codon += "N";
+            }
             auto translated = _codon_table.find(codon);
+
             if (translated != _codon_table.end()) {
                 // "second" is the element mapped to by the codon
-                aa += translated -> second;
+                // Because .find returns an iterator
+                residue = translated -> second;
             } else {
                 // Otherwise, assign the "X" or "unknown" amino acid
-                aa += "X";
+                residue = "X";
             }
+        } else if (codon.length() == 1){
+            // We only have one nucleotide, so the amino acid is unknown
+            residue = "X";
+        } else {
+            std::string msg = "Codon is invalid length: ";
+            msg += codon;
+            throw minhash_exception(msg);
+        }
+        return residue;
+    }
+
+    std::string _dna_to_aa(const std::string& dna) {
+        std::string aa;
+        std::string codon;
+        std::string residue;
+        unsigned int dna_size = (dna.size() / 3) * 3; // floor it
+        
+        for (unsigned int j = 0; j < dna_size; j += 3) {
+
+            codon = dna.substr(j, 3);
+
+            residue = translate_codon(codon);
+
+            // Use the Dayhoff encoding of amino acids, if requested
+            if (dayhoff) {
+                std::string new_letter = aa_to_dayhoff(residue);
+                aa += new_letter;
+            } else {
+                aa += residue;
+            }
+        }
         return aa;
     }
 
@@ -226,6 +266,22 @@ public:
       return out;
     }
 
+    std::string aa_to_dayhoff(const std::string& aa) const {
+        // Convert an amino acid letter to its Dayhoff encoding
+        std::string new_letter;
+
+        auto dayhoff_encoded = _dayhoff_table.find(aa);
+        if (dayhoff_encoded != _dayhoff_table.end()) {
+            // "second" is the element mapped to by the amino acid,
+            // because .find returns an iterator
+            new_letter = dayhoff_encoded -> second;
+        } else {
+            // Otherwise, assign the "X" or "unknown" amino acid
+            new_letter = "X";
+        }
+        return new_letter;
+    }
+
     virtual void merge(const KmerMinHash& other) {
       check_compatible(other);
@@ -263,7 +319,7 @@ private:
       {"TTT", "F"}, {"TTC", "F"},
       {"TTA", "L"}, {"TTG", "L"},
 
-      {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"},
+      {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"}, {"TCN", "S"},
 
       {"TAT", "Y"}, {"TAC", "Y"},
       {"TAA", "*"}, {"TAG", "*"},
@@ -272,19 +328,19 @@
       {"TGA", "*"}, {"TGG", "W"},
 
-      {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"},
+      {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"}, {"CTN", "L"},
 
-      {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"},
+      {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"}, {"CCN", "P"},
 
       {"CAT", "H"}, {"CAC", "H"},
       {"CAA", "Q"}, {"CAG", "Q"},
 
-      {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"},
+      {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"}, {"CGN", "R"},
 
       {"ATT", "I"}, {"ATC", "I"}, {"ATA", "I"},
       {"ATG", "M"},
 
-      {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"},
+      {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"}, {"ACN", "T"},
 
       {"AAT", "N"}, {"AAC", "N"},
       {"AAA", "K"}, {"AAG", "K"},
@@ -292,24 +348,62 @@
       {"AGT", "S"}, {"AGC", "S"},
       {"AGA", "R"}, {"AGG", "R"},
 
-      {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"},
+      {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"}, {"GTN", "V"},
 
-      {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"},
+      {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"}, {"GCN", "A"},
 
       {"GAT", "D"}, {"GAC", "D"},
       {"GAA", "E"}, {"GAG", "E"},
 
-      {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"}
+      {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"}, {"GGN", "G"}
     };
+
+
+// Dayhoff table from
+// Peris, P., López, D., & Campos, M. (2008).
+// IgTM: An algorithm to predict transmembrane domains and topology in
+// proteins. BMC Bioinformatics, 9(1), 1029–11.
+// http://doi.org/10.1186/1471-2105-9-367
+//
+// Original source:
+// Dayhoff M. O., Schwartz R. M., Orcutt B. C. (1978).
+// A model of evolutionary change in proteins,
+// in Atlas of Protein Sequence and Structure,
+// ed Dayhoff M. O., editor.
+// (Washington, DC: National Biomedical Research Foundation; ), 345–352.
+//
+// | Amino acid    | Property              | Dayhoff |
+// |---------------|-----------------------|---------|
+// | C             | Sulfur polymerization | a       |
+// | A, G, P, S, T | Small                 | b       |
+// | D, E, N, Q    | Acid and amide        | c       |
+// | H, K, R       | Basic                 | d       |
+// | I, L, M, V    | Hydrophobic           | e       |
+// | F, W, Y       | Aromatic              | f       |
+std::map<std::string, std::string> _dayhoff_table = {
+    {"C", "a"},
+
+    {"A", "b"}, {"G", "b"}, {"P", "b"}, {"S", "b"}, {"T", "b"},
+
+    {"D", "c"}, {"E", "c"}, {"N", "c"}, {"Q", "c"},
+
+    {"H", "d"}, {"K", "d"}, {"R", "d"},
+
+    {"I", "e"}, {"L", "e"}, {"M", "e"}, {"V", "e"},
+
+    {"F", "f"}, {"W", "f"}, {"Y", "f"}
+};
+
 };
 
 class KmerMinAbundance: public KmerMinHash {
 public:
     CMinHashType abunds;
 
-    KmerMinAbundance(unsigned int n, unsigned int k, bool prot, uint32_t seed,
-                     HashIntoType mx) :
-        KmerMinHash(n, k, prot, seed, mx) { };
+    KmerMinAbundance(unsigned int n, unsigned int k, bool prot, bool dayhoff,
+                     uint32_t seed, HashIntoType mx) :
+        KmerMinHash(n, k, prot, dayhoff, seed, mx) { };
 
     virtual void add_hash(HashIntoType h) {
       if ((max_hash and h <= max_hash) or not max_hash) {
@@ -415,7 +509,7 @@ class KmerMinAbundance: public KmerMinHash {
     std::copy(it2_m, other.mins.end(), out_m);
     std::copy(it2_a, other.abunds.end(), out_a);
 
-    if (merged_mins.size() < num) {
+    if (merged_mins.size() < num || !num) {
         mins = merged_mins;
         abunds = merged_abunds;
     } else {
diff --git a/sourmash/np_utils.py b/sourmash/np_utils.py
new file mode 100644
index 0000000000..a06ffd4e0a
--- /dev/null
+++ b/sourmash/np_utils.py
@@ -0,0 +1,21 @@
+import numpy as np
+import tempfile
+
+
+def to_memmap(array):
+    """Write an array to a memory-mapped file.
+    Create a memory-map to an array stored in a binary file on disk.
+    Memory-mapped files are used for accessing small segments of
+    large files on disk, without reading the entire file into memory.
+    :param np.array array: array to memory-map
+    :return: np.array large_memmap, the memory-mapped array
+    :return: str filename, name of the file the memory-mapped array is written to
+    """
+    filename = tempfile.NamedTemporaryFile(prefix="array", suffix=".mmap", delete=False).name
+    shape = array.shape
+    f = np.memmap(filename, mode='w+', shape=shape, dtype=array.dtype)
+    f[:] = array[:]
+    # deleting the writable memmap flushes the data to disk
+    del f
+    large_memmap = np.memmap(filename, dtype=array.dtype, shape=shape)
+    return large_memmap, filename
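Reviewer note: a quick, self-contained round-trip check for the new `to_memmap` helper (illustrative only; the data should come back unchanged from the on-disk copy):

```python
# Hedged sketch of the to_memmap round trip.
import numpy as np

from sourmash.np_utils import to_memmap

arr = np.arange(12, dtype=np.float64).reshape(3, 4)
memmapped, filename = to_memmap(arr)

# The memory-mapped copy is backed by `filename` on disk but indexes
# like a normal ndarray.
np.testing.assert_array_equal(memmapped, arr)
print(filename)  # e.g. /tmp/array<random>.mmap
```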
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index e08761387b..295dcc3477 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -46,7 +46,8 @@ def _check_abundance_compatibility(sig1, sig2):
 def _flatten(mh):
     "turn off track_abundance on a MinHash object"
     mh_params = list(mh.__getstate__())
-    mh_params[5] = False
+    # track_abundance is at index 6 of the __getstate__ tuple
+    mh_params[6] = False
     mh.__setstate__(mh_params)
     assert not mh.track_abundance
 
@@ -54,8 +55,10 @@ def _flatten(mh):
 def _set_num_scaled(mh, num, scaled):
     "set num and scaled values on a MinHash object"
     mh_params = list(mh.__getstate__())
+    # the number of hashes is at index 0
     mh_params[0] = num
-    mh_params[6] = get_max_hash_for_scaled(scaled)
+    # max_hash (the scaled cutoff) is at index 7
+    mh_params[7] = get_max_hash_for_scaled(scaled)
     mh.__setstate__(mh_params)
     assert mh.num == num
     assert mh.scaled == scaled
@@ -111,7 +114,10 @@ def describe(args):
         ksize = mh.ksize
         moltype = 'DNA'
         if mh.is_protein:
-            moltype = 'protein'
+            if mh.dayhoff:
+                moltype = 'dayhoff'
+            else:
+                moltype = 'protein'
         scaled = mh.scaled
         num = mh.num
         seed = mh.seed
diff --git a/sourmash/signature.py b/sourmash/signature.py
index d4fd4ab3fe..53d23d402b 100644
--- a/sourmash/signature.py
+++ b/sourmash/signature.py
@@ -99,8 +99,10 @@ def _save(self):
             sketch['mins'] = list(map(int, minhash.get_mins()))
             sketch['md5sum'] = self.md5sum()
 
-            if minhash.is_protein:
+            if minhash.is_protein and not minhash.dayhoff:
                 sketch['molecule'] = 'protein'
+            elif minhash.dayhoff:
+                sketch['molecule'] = 'dayhoff'
             else:
                 sketch['molecule'] = 'DNA'
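Reviewer note: a hedged sketch of how the new 'dayhoff' molecule type is expected to round-trip through save/load, using the extended MinHash constructor from this patch (the peptide is an arbitrary example):

```python
# Sketch: 'dayhoff' should survive a JSON save/load round trip.
import sourmash
from sourmash import signature
from sourmash.signature import SourmashSignature

mh = sourmash.MinHash(n=500, ksize=21, is_protein=True, dayhoff=True)
mh.add_protein('MVLSPADKTNVKAAW')  # arbitrary example peptide

sig = SourmashSignature(mh, name='dayhoff-example')
json_str = signature.save_signatures([sig])
assert '"molecule":"dayhoff"' in json_str

loaded = next(signature.load_signatures(json_str))
assert loaded.minhash.dayhoff
```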
diff --git a/sourmash/signature_json.py b/sourmash/signature_json.py
index b7c53915fe..143f5a02de 100644
--- a/sourmash/signature_json.py
+++ b/sourmash/signature_json.py
@@ -6,10 +6,9 @@
 # This was written for Python 3, may be there is a chance it will work with Python 2...
 from __future__ import print_function, unicode_literals
 
-import sys
-
 import io
 import json
+import time
 try:
     import ijson.backends.yajl2 as ijson
 except ImportError:
@@ -82,16 +81,23 @@ def _json_next_signature(iterable,
     molecule = d.get('molecule', 'DNA')
     if molecule == 'protein':
         is_protein = True
+        dayhoff = False
+    elif molecule == "dayhoff":
+        is_protein = True
+        dayhoff = True
     elif molecule.upper() == 'DNA':
         is_protein = False
+        dayhoff = False
     else:
         raise Exception("unknown molecule type: {}".format(molecule))
+
     track_abundance = False
     if 'abundances' in d:
         track_abundance = True
 
     e = MinHash(ksize=ksize, n=n, is_protein=is_protein,
+                dayhoff=dayhoff,
                 track_abundance=track_abundance,
                 max_hash=max_hash,
                 seed=seed)
@@ -229,16 +235,15 @@ def load_signatures_json(data, ksize=None, ignore_md5sum=True, ijson=ijson):
             notify('\r...sig loading {:,}', n, flush=True)
 
 
-def save_signatures_json(siglist, fp=None, indent=None, sort_keys=True):
-    """ Save multiple signatures into a JSON string (or into file handle 'fp')
+def add_meta_save(siglist):
+    """ Convert signatures into a list of JSON-ready record dicts
 
     - siglist: sequence of SourmashSignature objects
-    - fp:
-    - indent: indentation spaces (an integer) or if None no indentation
-    - sort_keys: sort the keys in mappings before writting to JSON
+    - returns: list of records, one per distinct (name, filename) pair
     """
     from .signature import SIGNATURE_VERSION
-
+    records = []
     top_records = {}
+
     for sig in siglist:
         name, filename, sketch = sig._save()
         k = (name, filename)
@@ -247,9 +252,8 @@
         x = top_records[k]
 
     if not top_records:
-        return ""
+        return records
 
-    records = []
     for (name, filename), sketches in top_records.items():
         record = {}
         if name:
@@ -263,9 +267,12 @@
         record['hash_function'] = '0.murmur64'
         record['license'] = 'CC0'
         record['email'] = ''
-        records.append(record)
+        records.append(record)
+
+    return records
+
+
+def write_records_to_json(records, fp=None, indent=None, sort_keys=True):
     s = json.dumps(records, indent=indent, sort_keys=sort_keys,
                    separators=(str(','), str(':')))
     if fp:
         try:
@@ -273,5 +280,21 @@
             fp.write(s)
         except TypeError:
             fp.write(unicode(s))
         return None
+    return s
+
+
+def save_signatures_json(
+        siglist, fp=None, indent=None, sort_keys=True):
+    """ Save multiple signatures into a JSON string (or into file handle 'fp')
+    - siglist: sequence of SourmashSignature objects
+    - fp: file handle to the location of a sig file
+    - indent: indentation spaces (an integer) or if None no indentation
+    - sort_keys: sort the keys in mappings before writing to JSON
+    """
+    startt = time.time()
+    records = add_meta_save(siglist)
+    if records == []:
+        return ""
+    s = write_records_to_json(records, fp, indent, sort_keys)
+    notify("time taken to save signatures is {:.5f} seconds", time.time() - startt, end="\r")
+    return s
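Reviewer note: the refactor splits serialization into record building (`add_meta_save`) and JSON writing (`write_records_to_json`); a hedged sketch of how the two pieces compose, equivalent to calling `save_signatures_json` directly (the 'class' key is assumed to be part of every saved record):

```python
# Sketch of the refactored save path (illustrative only).
import sourmash
from sourmash.signature import SourmashSignature
from sourmash.signature_json import add_meta_save, write_records_to_json

mh = sourmash.MinHash(n=1, ksize=20)
sig = SourmashSignature(mh, name='example')

records = add_meta_save([sig])             # build plain dicts; no I/O
json_str = write_records_to_json(records)  # fp=None returns a string
assert '"class":"sourmash_signature"' in json_str
```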
diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py
index 214f6086f7..93562002ce 100644
--- a/sourmash/sourmash_args.py
+++ b/sourmash/sourmash_args.py
@@ -53,6 +53,13 @@ def add_moltype_args(parser):
                         help='do not choose a protein signature')
     parser.set_defaults(protein=False)
 
+    parser.add_argument('--dayhoff', dest='dayhoff', action='store_true',
+                        help='build Dayhoff-encoded amino acid signatures (default: False)')
+    parser.add_argument('--no-dayhoff', dest='dayhoff',
+                        action='store_false',
+                        help='do not build Dayhoff-encoded amino acid signatures')
+    parser.set_defaults(dayhoff=False)
+
     parser.add_argument('--dna', '--rna', dest='dna', default=None,
                         action='store_true',
                         help='choose a nucleotide signature (default: True)')
@@ -70,6 +77,13 @@ def add_construct_moltype_args(parser):
                         help='do not build protein signatures')
     parser.set_defaults(protein=False)
 
+    parser.add_argument('--dayhoff', dest='dayhoff', action='store_true',
+                        help='build Dayhoff-encoded amino acid signatures (default: False)')
+    parser.add_argument('--no-dayhoff', dest='dayhoff',
+                        action='store_false',
+                        help='do not build Dayhoff-encoded amino acid signatures')
+    parser.set_defaults(dayhoff=False)
+
     parser.add_argument('--dna', '--rna', dest='dna', default=None,
                         action='store_true',
                         help='build nucleotide signatures (default: True)')
@@ -89,6 +103,8 @@ def get_moltype(sig, require=False):
         moltype = 'DNA'
     elif sig.minhash.is_molecule_type('protein'):
         moltype = 'protein'
+    elif sig.minhash.is_molecule_type('dayhoff'):
+        moltype = 'dayhoff'
     else:
         raise ValueError('unknown molecule type for sig {}'.format(sig.name()))
 
@@ -107,6 +123,8 @@ def calculate_moltype(args, default=None):
         moltype = 'protein'
     elif args.dna:
         moltype = 'DNA'
+    elif args.dayhoff:
+        moltype = 'dayhoff'
 
     return moltype
 
diff --git a/sourmash/tenx.py b/sourmash/tenx.py
index 32fecf0d4e..8dd3596899 100644
--- a/sourmash/tenx.py
+++ b/sourmash/tenx.py
@@ -1,20 +1,268 @@
+"""
+10x-sequencing specific utility functions.
+"""
+
 import os
+from .logging import notify
+from collections import defaultdict
+import tempfile
+import time
+import numpy as np
+
+CELL_BARCODES = ['CB', 'XC']
+UMIS = ['UB', 'XM']
+
+
+def pass_alignment_qc(alignment, barcodes):
+    """
+    Check for a high-quality mapping, a QC-passing barcode, and a UMI.
+
+    alignment : pysam.AlignedSegment
+        aligned bam segment
+    barcodes : list
+        List of cellular barcode strings
+    Returns
+    -------
+    pass_qc : boolean
+        True for a high-quality, non-duplicate alignment with a QC-passing
+        barcode and a UMI; False otherwise
+    """
+    high_quality_mapping = alignment.mapq == 255
+    if barcodes is not None:
+        good_cell_barcode = any(
+            [alignment.has_tag(cb) and alignment.get_tag(cb) in barcodes
+             for cb in CELL_BARCODES])
+    else:
+        good_cell_barcode = any([alignment.has_tag(cb) for cb in CELL_BARCODES])
+    good_molecular_barcode = any([alignment.has_tag(umi) for umi in UMIS])
+    not_duplicate = not alignment.is_duplicate
+
+    pass_qc = high_quality_mapping and good_cell_barcode and \
+        good_molecular_barcode and not_duplicate
+    return pass_qc
+
+
+def parse_barcode_renamer(barcodes, barcode_renamer):
+    """
+    Return a dictionary mapping each cell barcode to its renamed barcode.
+
+    barcodes : list
+        List of cellular barcode strings
+    barcode_renamer : str
+        Path to a tab-separated file mapping barcodes to their new names,
+        e.g. with a channel or cell annotation label:
+        AAATGCCCAAACTGCT-1 lung_epithelial_cell|AAATGCCCAAACTGCT-1
+    Returns
+    -------
+    renamer : dict
+        A (str, str) mapping of the original barcode to its new name
+    """
+    if barcode_renamer is not None:
+        renamer = {}
+
+        with open(barcode_renamer) as f:
+            for line in f.readlines():
+                barcode, renamed = line.split()
+                assert barcode in barcodes
+                renamer[barcode] = renamed
+    else:
+        renamer = dict(zip(barcodes, barcodes))
+    return renamer
+
+
+def read_barcodes_file(barcode_path):
+    """Read a single-column barcodes.tsv file from 10x.
+
+    Parameters
+    ----------
+    barcode_path : str
+        Name of a 10x 'barcodes.tsv' file
+    Returns
+    -------
+    barcodes : list
+        List of QC-passing barcodes from 'barcodes.tsv'
+    """
+    with open(barcode_path) as f:
+        barcodes = np.unique([line.strip() for line in f])
+    return barcodes
+
+
+def read_bam_file(bam_path):
+    """Read from a QC-pass bam file.
+
+    Parameters
+    ----------
+    bam_path : str
+        Name of a 10x cellranger bam file,
+        'possorted_genome_bam.bam'
+    Returns
+    -------
+    bam_file : pysam.AlignmentFile
+        Iterator over the possorted_genome_bam.bam file
+    """
+    import pysam
+
+    return pysam.AlignmentFile(bam_path, mode='rb')
+
+
+def shard_bam_file(bam_file_path, chunked_file_line_count, shards_folder):
+    """Shard a QC-pass bam file with the given line count, saving shards to shards_folder.
+
+    Parameters
+    ----------
+    bam_file_path : str
+        Bam file to shard
+    chunked_file_line_count : int
+        number of lines/alignment reads in each sharded bam file
+    shards_folder : str
+        absolute path to save the sharded bam files to
+    Returns
+    -------
+    shards : list
+        list of sharded bam filenames
+    """
+    import pysam
+
+    notify("Sharding the bam file")
+    startt = time.time()
+    file_names = []
+
+    with read_bam_file(bam_file_path) as bam_file:
+        line_count = 0
+        file_count = 0
+        header = bam_file.header
+        for index, alignment in enumerate(bam_file):
+            if line_count == 0:
+                file_name = os.path.join(
+                    shards_folder,
+                    "temp_bam_shard_{}.bam".format(file_count))
+                file_names.append(file_name)
+                outf = pysam.AlignmentFile(file_name, "wb", header=header)
+            if line_count == chunked_file_line_count:
+                file_count = file_count + 1
+                line_count = 0
+                outf.write(alignment)
+                outf.close()
+                notify("===== Sharding bam file ====== {}", file_count,
+                       end="\r")
+            else:
+                outf.write(alignment)
+                line_count = line_count + 1
+        outf.close()
+
+    notify("time taken to shard the bam file into {} shards is {:.5f} seconds",
+           file_count, time.time() - startt)
+    return file_names
+
+
+def bam_to_fasta(barcodes, barcode_renamer, delimiter, umi_filter, bam_file):
+    """Convert a 10x bam to a one-record-per-cell fasta.
+
+    Parameters
+    ----------
+    barcodes : list of str
+        QC-passing barcodes
+    barcode_renamer : str or None
+        Tab-separated filename mapping a barcode to a new name, e.g.
+        AAATGCCCAAACTGCT-1 lung_epithelial_cell|AAATGCCCAAACTGCT-1
+    delimiter : str
+        A character outside the DNA/protein alphabet, used to separate a
+        cell's sequences: e.g. if a cell has two sequences 'AAAAAAAAA' and
+        'CCCCCCCC', they are concatenated as 'AAAAAAAAAXCCCCCCCC'.
+    umi_filter : boolean
+        If True, the UMI is written as the fasta record name and the cell
+        barcode becomes the fasta file name in the output fastas.
+    bam_file : str
+        Path to a 10x bam file; opened with pysam.
+    Returns
+    -------
+    filenames : list
+        one temp fasta filename for each cell's high-quality, non-duplicate
+        reads
+    """
+    bam = read_bam_file(bam_file)
+
+    # Keep only high-quality alignments and/or alignments with selected
+    # barcodes
+    bam_filtered = (x for x in bam if pass_alignment_qc(x, barcodes))
+    if barcode_renamer is not None and barcodes is not None:
+        renamer = parse_barcode_renamer(barcodes, barcode_renamer)
+    else:
+        renamer = None
+    cell_sequences = defaultdict(str)
+
+    for alignment in bam_filtered:
+        # Get the barcode of the alignment, which looks like
+        # "AAATGCCCAAACTGCT-1"; a bam file may carry a good cell barcode
+        # under any of the tags in CELL_BARCODES
+        for cb in CELL_BARCODES:
+            if alignment.has_tag(cb):
+                barcode = alignment.get_tag(cb)
+
+        # If umi_filter is True, add the umi to the cell barcode, separated
+        # by the delimiter: if "AAATGCCCAAACTGCT-1" is the CELL_BARCODE and
+        # AGCT is the UMI, the key in cell_sequences is
+        # AAATGCCCAAACTGCT-1XAGCT
+        renamed = renamer[barcode] if renamer is not None else barcode
+        if umi_filter:
+            for umi_tag in UMIS:
+                if alignment.has_tag(umi_tag):
+                    umi = alignment.get_tag(umi_tag)
+                    renamed = renamed + delimiter + umi
+
+        # Make one long string of all the cell's sequences, separated
+        # by a non-alphabet letter
+        cell_sequences[renamed] += alignment.seq + delimiter
+
+    filenames = list(set(write_cell_sequences(cell_sequences, umi_filter, delimiter)))
+    notify("bam_to_fasta conversion completed on {}", bam_file, end='\r', flush=True)
+    bam.close()
-
-def read_single_column(filename):
-    """Read single-column barcodes.tsv and genes.tsv files from 10x"""
-    with open(filename) as f:
-        lines = set(line.strip() for line in f)
-    return lines
+    return filenames
 
-def read_10x_folder(folder):
-    """Get QC-pass barcodes, genes, and bam file from a 10x folder"""
-    import bamnostic as bs
+def write_cell_sequences(cell_sequences, umi_filter=False, delimiter="X"):
+    """
+    Write each cell's sequences to an individual file.
+
+    Parameters
+    ----------
+    cell_sequences : dict
+        Mapping of a cell to its corresponding sequence. If umi_filter is
+        True, the cell key is expected to also contain the umi, separated
+        by the delimiter: e.g. the dictionary is {AAAAAAAAA: AGCTACACTA}
+        if umi_filter is False, else {AAAAAAAAAXACTAG: AGCTACACTA}. In the
+        latter case AAAAAAAAA is the cell barcode and ACTAG is the umi.
+        The umi is used further by downstream processing functions as
+        appropriate; the barcode is returned as the fasta filename and the
+        umi is saved as the record name/sequence id in the fasta file.
+    umi_filter : boolean, default False
+        If True, the umi is written in the fasta file as the record name,
+        and the cell barcode is the fasta file name in the output fastas.
+    delimiter : str, default X
+        Used to separate the barcode and umi in the cell_sequences keys.
 
-    barcodes = read_single_column(os.path.join(folder, 'barcodes.tsv'))
+    Returns
+    -------
+    filenames : generator
+        one temp fasta filename per cell (or cell_umi) with sequence
+    """
+    temp_folder = tempfile.mkdtemp()
 
-    bam_file = bs.AlignmentFile(
-        os.path.join(folder, 'possorted_genome_bam.bam'), mode='rb')
+    for cell, seq in cell_sequences.items():
+        if umi_filter:
+            barcode, umi = cell.split(delimiter)
+            filename = os.path.join(temp_folder, barcode + '.fasta')
 
-    return barcodes, bam_file
+            # Append to an existing barcode file with a different umi;
+            # records after the first start on a fresh line so the fasta
+            # stays well-formed
+            with open(filename, "a") as f:
+                if os.stat(filename).st_size != 0:
+                    f.write("\n>{}\n{}".format(umi, seq))
+                else:
+                    f.write(">{}\n{}".format(umi, seq))
+            yield filename
+        else:
+            filename = os.path.join(temp_folder, cell + '.fasta')
+            with open(filename, "w") as f:
+                f.write(">{}\n{}".format(cell, seq))
+            yield filename
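Reviewer note: a hedged sketch of the intended 10x workflow built from the helpers above (requires pysam; the input paths are hypothetical):

```python
# Sketch: turn a 10x bam into per-cell fastas, ready for `sourmash compute`.
from sourmash.tenx import bam_to_fasta, read_barcodes_file

barcodes = read_barcodes_file("10x-example/barcodes.tsv")  # hypothetical path
fastas = bam_to_fasta(barcodes=barcodes, barcode_renamer=None,
                      delimiter="X", umi_filter=False,
                      bam_file="10x-example/possorted_genome_bam.bam")
# One temporary fasta per cell barcode.
print(len(fastas))
```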
diff --git a/src/lib.rs b/src/lib.rs
index 63ce3c1b2b..e9c1b47400 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,8 +29,6 @@ pub mod sketch;
 #[cfg(feature = "from-finch")]
 pub mod from;
 
-pub mod cmd;
-
 use cfg_if::cfg_if;
 use murmurhash3::murmurhash3_x64_128;
 
@@ -39,6 +37,8 @@ cfg_if! {
         pub mod wasm;
     } else {
         pub mod ffi;
+
+        pub mod cmd;
     }
 }
diff --git a/src/sketch/minhash.rs b/src/sketch/minhash.rs
index 059e8a17d2..e95ab3fdc5 100644
--- a/src/sketch/minhash.rs
+++ b/src/sketch/minhash.rs
@@ -497,7 +497,7 @@ impl SigsTrait for KmerMinHash {
         for kmer in sequence.windows(self.ksize as usize) {
             if _checkdna(kmer) {
                 let rc = revcomp(kmer);
-                if kmer < &rc {
+                if kmer < rc.as_slice() {
                     self.add_word(kmer);
                 } else {
                     self.add_word(&rc);
diff --git a/src/wasm.rs b/src/wasm.rs
index a328e64a4e..3bd88e54af 100644
--- a/src/wasm.rs
+++ b/src/wasm.rs
@@ -2,7 +2,8 @@ use wasm_bindgen::prelude::*;
 
 use serde_json;
 
-use crate::KmerMinHash;
+use crate::signature::SigsTrait;
+use crate::sketch::minhash::KmerMinHash;
 
 #[wasm_bindgen]
 impl KmerMinHash {
diff --git a/tests/conftest.py b/tests/conftest.py
index 37ac98a848..28d42ac5c5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,35 @@ def track_abundance(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def dayhoff(request):
+    return request.param
+
+
 @pytest.fixture(params=[2, 5, 10])
 def n_children(request):
     return request.param
+
+
+# --- BEGIN - Only run tests using a particular fixture --- #
+# Cribbed from: http://pythontesting.net/framework/pytest/pytest-run-tests-using-particular-fixture/
+def pytest_collection_modifyitems(items, config):
+    fixture_name = config.option.usesfixture
+    if fixture_name is not None:
+        selected_items = []
+        deselected_items = []
+
+        for item in items:
+            if fixture_name in getattr(item, 'fixturenames', ()):
+                selected_items.append(item)
+            else:
+                deselected_items.append(item)
+        config.hook.pytest_deselected(items=deselected_items)
+        items[:] = selected_items
+
+
+def pytest_addoption(parser):
+    parser.addoption("--usesfixture",
+                     action="store",
+                     default=None,
+                     help="just run tests that use a particular fixture")
+# --- END - Only run tests using a particular fixture --- #
diff --git a/tests/test-data/10x-example/barcodes_renamer.tsv b/tests/test-data/10x-example/barcodes_renamer.tsv
new file mode 100644
index 0000000000..2bf405423e
--- /dev/null
+++ b/tests/test-data/10x-example/barcodes_renamer.tsv
@@ -0,0 +1,10 @@
+AAACGGGTCTCGTATT-1 lung_epithelial_cell|AAACGGGTCTCGTATT-1
+AAACGGGAGGATATAC-1 human_epithelial_cell|AAACGGGAGGATATAC-1
+AAAGATGCAGATCTGT-1 mouse_epithelial_cell|AAAGATGCAGATCTGT-1
+AAATGCCAGATAGTCA-1 lung_epithelial_cell|AAATGCCAGATAGTCA-1
+AAATGCCCAAACTGCT-1 lung_epithelial_cell|AAATGCCCAAACTGCT-1
+AAATGCCGTGAACCTT-1 lung_epithelial_cell|AAATGCCGTGAACCTT-1
+AACACGTAGTGTACCT-1 lung_epithelial_cell|AACACGTAGTGTACCT-1
+AACACGTGTGGCTCCA-1 lung_epithelial_cell|AACACGTGTGGCTCCA-1
+AACCATGAGTTGTCGT-1 lung_epithelial_cell|AACCATGAGTTGTCGT-1
+AACCATGCACGTCAGC-1 lung_epithelial_cell|AACCATGCACGTCAGC-1
diff --git a/tests/test-data/10x-example/possorted_genome_bam_filtered.bam b/tests/test-data/10x-example/possorted_genome_bam_filtered.bam
new file mode 100644
index 0000000000..7266d5a511
Binary files /dev/null and b/tests/test-data/10x-example/possorted_genome_bam_filtered.bam differ
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index e3d593e0df..4508f9440b 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -107,9 +107,9 @@ def test_bytes_dna(track_abundance):
     assert len(b) == 1
 
 
-def test_bytes_protein(track_abundance):
+def test_bytes_protein(track_abundance, dayhoff):
     # verify that we can hash protein/aa sequences
-    mh = MinHash(10, 6, True, track_abundance=track_abundance)
+    mh = MinHash(10, 6, True, dayhoff=dayhoff, track_abundance=track_abundance)
     mh.add_protein('AGYYG')
     mh.add_protein(u'AGYYG')
     mh.add_protein(b'AGYYG')
 
     assert len(mh.get_mins()) == 4
 
 
-def test_protein(track_abundance):
+def test_protein(track_abundance, dayhoff):
     # verify that we can hash protein/aa sequences
-    mh = MinHash(10, 6, True, track_abundance=track_abundance)
+    mh = MinHash(10, 6, True, dayhoff=dayhoff, track_abundance=track_abundance)
     mh.add_protein('AGYYG')
 
     assert len(mh.get_mins()) == 4
 
 
+def test_translate_codon(track_abundance):
+    # Ensure that translation occurs properly
+    mh = MinHash(10, 6, is_protein=True)
+    assert "S" == mh.translate_codon('TCT')
+    assert "S" == mh.translate_codon('TC')
+    assert "X" == mh.translate_codon("T")
+
+    with pytest.raises(ValueError):
+        mh.translate_codon("")
+
+    with pytest.raises(ValueError):
+        mh.translate_codon("TCTA")
+
+
+def test_dayhoff(track_abundance):
+    # verify that we can hash dayhoff-encoded protein/aa sequences
+    mh_dayhoff = MinHash(10, 6, is_protein=True,
+                         dayhoff=True, track_abundance=track_abundance)
+    mh_dayhoff.add_sequence('ACTGAC')
+
+    assert len(mh_dayhoff.get_mins()) == 2
+    # verify that dayhoff-encoded hashes are different from protein/aa hashes
+    mh_protein = MinHash(10, 6, is_protein=True, track_abundance=track_abundance)
+    mh_protein.add_sequence('ACTGAC')
+
+    assert len(mh_protein.get_mins()) == 2
+    assert mh_protein.get_mins() != mh_dayhoff.get_mins()
+
+
 def test_protein_short(track_abundance):
     # verify that we can hash protein/aa sequences
     mh = MinHash(10, 9, True, track_abundance=track_abundance)
@@ -483,6 +510,42 @@ def test_mh_merge(track_abundance):
     assert d.compare(c) == 1.0
 
 
+def test_mh_merge_empty_num(track_abundance):
+    # test merging two identically configured minhashes, one empty
+    a = MinHash(20, 10, track_abundance=track_abundance)
+
+    b = MinHash(20, 10, track_abundance=track_abundance)
+    for i in range(0, 80, 4):
+        b.add_hash(i)
+
+    c = a.merge(b)
+    d = b.merge(a)
+
+    assert len(c)
+    assert len(c) == len(d)
+    assert c.get_mins() == d.get_mins()
+    assert c.compare(d) == 1.0
+    assert d.compare(c) == 1.0
+
+
+def test_mh_merge_empty_scaled(track_abundance):
+    # test merging two identically configured minhashes, one empty
+    a = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
+
+    b = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
+    for i in range(0, 80,
4): + b.add_hash(i) + + c = a.merge(b) + d = b.merge(a) + + assert len(c) + assert len(c) == len(d) + assert c.get_mins() == d.get_mins() + assert c.compare(d) == 1.0 + assert d.compare(c) == 1.0 + + def test_mh_merge_check_length(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) for i in range(0, 40, 2): diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 57b9adad4f..16f3a8ca30 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -4,6 +4,7 @@ from __future__ import print_function, unicode_literals import csv import shutil +import os import pytest @@ -623,6 +624,85 @@ def test_sig_describe_1(c): for line in expected_output: assert line.strip() in out +@utils.in_tempdir +def test_sig_describe_1_dayhoff(c): + # get basic info on a signature + testdata = utils.get_test_data('short.fa') + c.run_sourmash('compute', '-k', '21,30', + '--dayhoff', '--protein', + '--dna', + testdata) + # stdout should be new signature + computed_sig = os.path.join(c.location, 'short.fa.sig') + c.run_sourmash('sig', 'describe', computed_sig) + + out = c.last_result.out + print(c.last_result) + + # Add final trailing slash for this OS + testdata_dirname = os.path.dirname(testdata) + os.sep + location = c.location + os.sep + + expected_output = """\ +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: e45a080101751e044d6df861d3d0f3fd +k=21 molecule=protein num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: ef4fa1f3a90f3873187370f1eacc0d9a +k=21 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 1136a8a68420bd93683e45cdaf109b80 +k=21 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 4244d1612598af044e799587132f007e +k=30 molecule=protein num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 5647819f2eac913e04af51c8d548ad56 +k=30 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 71f7c111c01785e5f38efad45b00a0e1 +k=30 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 +""".splitlines() + for line in out.splitlines(): + cleaned_line = line.strip().replace( + testdata_dirname, '').replace(location, '') + assert cleaned_line in expected_output + @utils.in_tempdir def test_sig_describe_1_multisig(c): diff --git a/tests/test_compare.py b/tests/test_compare.py new file mode 100644 index 0000000000..9555993c6b --- /dev/null +++ b/tests/test_compare.py @@ -0,0 +1,60 @@ +import glob +import os + +import numpy as np +import pytest + +import sourmash +from sourmash.compare import compare_all_pairs, compare_parallel, compare_serial +from . 
import sourmash_tst_utils as utils + + +@pytest.fixture() +def siglist(): + demo_path = utils.get_test_data("demo") + filenames = sorted(glob.glob(os.path.join(demo_path, "*.sig"))) + sigs = [] + for filename in filenames: + sigs.extend(sourmash.load_signatures(filename)) + return sigs + + +@pytest.fixture() +def ignore_abundance(track_abundance): + return not track_abundance + + +def test_compare_serial(siglist, ignore_abundance): + similarities = compare_serial(siglist, ignore_abundance, downsample=False) + + true_similarities = np.array( + [[1., 0.356, 0.078, 0.086, 0., 0., 0.], + [0.356, 1., 0.072, 0.078, 0., 0., 0.], + [0.078, 0.072, 1., 0.074, 0., 0., 0.], + [0.086, 0.078, 0.074, 1., 0., 0., 0.], + [0., 0., 0., 0., 1., 0.382, 0.364], + [0., 0., 0., 0., 0.382, 1., 0.386], + [0., 0., 0., 0., 0.364, 0.386, 1.]]) + + np.testing.assert_array_equal(similarities, true_similarities) + + +def test_compare_parallel(siglist, ignore_abundance): + similarities = compare_parallel(siglist, ignore_abundance, downsample=False, n_jobs=2) + + true_similarities = np.array( + [[1., 0.356, 0.078, 0.086, 0., 0., 0.], + [0.356, 1., 0.072, 0.078, 0., 0., 0.], + [0.078, 0.072, 1., 0.074, 0., 0., 0.], + [0.086, 0.078, 0.074, 1., 0., 0., 0.], + [0., 0., 0., 0., 1., 0.382, 0.364], + [0., 0., 0., 0., 0.382, 1., 0.386], + [0., 0., 0., 0., 0.364, 0.386, 1.]]) + + np.testing.assert_array_equal(similarities, true_similarities) + + +def test_compare_all_pairs(siglist, ignore_abundance): + similarities_parallel = compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=2) + similarities_serial = compare_serial(siglist, ignore_abundance, downsample=False) + np.testing.assert_array_equal(similarities_parallel, similarities_serial) diff --git a/tests/test_np_utils.py b/tests/test_np_utils.py new file mode 100644 index 0000000000..50aaa756f4 --- /dev/null +++ b/tests/test_np_utils.py @@ -0,0 +1,18 @@ +import numpy as np +from sourmash.signature import SourmashSignature +import sourmash +from sourmash.np_utils import to_memmap + + +def test_memmap(): + + e1 = sourmash.MinHash(n=1, ksize=20) + sig1 = SourmashSignature(e1) + + e2 = sourmash.MinHash(n=1, ksize=25) + sig2 = SourmashSignature(e2) + siglist = [sig1, sig2] + memmapped, filename = to_memmap(np.array(siglist)) + # Assert that the data didn't change as a result of memory-mapping + np.testing.assert_array_equal(memmapped, siglist) + assert filename.endswith(".mmap") diff --git a/tests/test_signature_json.py b/tests/test_signature_json.py index 7112d0963f..8a3e3823ad 100644 --- a/tests/test_signature_json.py +++ b/tests/test_signature_json.py @@ -11,7 +11,7 @@ load_signatureset_json_iter, save_signatures_json) from collections import OrderedDict - + def test__json_next_atomic_array(): t = (2,3,4,5,6) s = json.dumps(t) @@ -111,6 +111,40 @@ def test_load_signaturesset_json_iter(): assert len(sig_entries) == 2 +# integration test more than a unit test +def test_load_signaturesset_json_iter_molecules(): + + t = list() + molecules = 'DNA', 'protein', 'dayhoff' + names = "Foo", 'Bar', "Biz" + filenames = '/tmp/foo', '/tmp/bar', '/tmp/biz' + + for molecule, name, filename in zip(molecules, names, filenames): + minhash = (2,3,4,5,6) + t.append(OrderedDict(( + ('class', 'sourmash_signature'), + ('name', name), + ('filename', filename), + ('signatures', + ( + OrderedDict((('ksize', 21), + ('num', len(minhash)), + #('md5sum', ), + ('molecule', molecule), + ('cardinality', 123456), + ('mins', minhash))), + ))))) + + s = json.dumps(t) + if sys.version_info[0] < 3: + s 
= unicode(s) + # no MD5SUM + sig_entries = tuple(load_signatureset_json_iter(io.StringIO(s), + ignore_md5sum=True, + ijson=ijson)) + # Ensure all molecule types were read properly + assert len(sig_entries) == 3 + def test_save_load_multisig_json(): e1 = sourmash.MinHash(n=1, ksize=20) sig1 = SourmashSignature(e1) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 69964abd4a..d7abdf7a5e 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5,7 +5,6 @@ import os import gzip import shutil -import time import screed import glob import json @@ -26,6 +25,7 @@ from sourmash import signature from sourmash import VERSION + def test_run_sourmash(): status, out, err = utils.runscript('sourmash', [], fail_ok=True) assert status != 0 # no args provided, ok ;) @@ -56,644 +56,45 @@ def test_sourmash_info_verbose(): assert "loaded from path" in err -def test_do_sourmash_compute(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name().endswith('short.fa') - - -def test_do_sourmash_compute_output_valid_file(): - """ Trigger bug #123 """ - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(location, 'short.fa.sig') - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', sigfile, - testdata1, - testdata2, testdata3], - in_directory=location) - - assert os.path.exists(sigfile) - assert not out # stdout should be empty - - # is it valid json? - with open(sigfile, 'r') as f: - data = json.load(f) - - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) - - -def test_do_sourmash_compute_output_stdout_valid(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', '-', - testdata1, - testdata2, testdata3], - in_directory=location) - - # is it valid json? - data = json.loads(out) - - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) - - -def test_do_sourmash_compute_output_and_name_valid_file(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(location, 'short.fa.sig') - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', sigfile, - '--merge', '"name"', - testdata1, - testdata2, testdata3], - in_directory=location) - - assert os.path.exists(sigfile) - assert 'calculated 1 signatures for 4 sequences taken from 3 files' in err - - # is it valid json? 
- with open(sigfile, 'r') as f: - data = json.load(f) - - assert len(data) == 1 - - all_testdata = " ".join([testdata1, testdata2, testdata3]) - sigfile_merged = os.path.join(location, 'short.all.fa.sig') - cmd = "cat {} | sourmash compute -k 31 -o {} -".format( - all_testdata, sigfile_merged) - status, out, err = utils.run_shell_cmd(cmd, in_directory=location) - - with open(sigfile_merged, 'r') as f: - data_merged = json.load(f) - - assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins'] - - -def test_do_sourmash_compute_singleton(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--singleton', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name().endswith('shortName') - - -def test_do_sourmash_compute_10x(): - bamnostic = pytest.importorskip('bamnostic') - - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('10x-example') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', - '--input-is-10x', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, '10x-example.sig') - assert os.path.exists(sigfile) - - with open(sigfile) as f: - data = json.load(f) - - barcode_signatures = [sig['name'] for sig in data] - - with open(utils.get_test_data('10x-example/barcodes.tsv')) as f: - true_barcodes = set(x.strip() for x in f.readlines()) - - # Ensure that every cell barcode in barcodes.tsv has a signature - assert all(bc in true_barcodes for bc in barcode_signatures) - - - -def test_do_sourmash_compute_name(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1, '-o', 'foo.sig'], - in_directory=location) - - sigfile = os.path.join(location, 'foo.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name() == 'foo' - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--name', 'foo', - testdata1, '-o', 'foo2.sig'], - in_directory=location) - - sigfile2 = os.path.join(location, 'foo2.sig') - assert os.path.exists(sigfile) - - sig2 = next(signature.load_signatures(sigfile)) - assert sig2.name() == 'foo' - assert sig.name() == sig2.name() - - -def test_do_sourmash_compute_name_fail_no_output(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1], - in_directory=location, - fail_ok=True) - assert status == -1 - - -def test_do_sourmash_compute_merge_fail_no_output(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1], - in_directory=location, - fail_ok=True) - assert status == -1 - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--name', 'foo', - testdata1], - in_directory=location, - fail_ok=True) - assert status == -1 - - -def test_do_sourmash_compute_name_from_first(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short3.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', 
'--name-from-first', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short3.fa.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name() == 'firstname' - - -def test_do_sourmash_compute_multik(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 31 in ksizes - - -def test_do_sourmash_compute_multik_with_protein(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 4 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_multik_with_nothing(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--no-protein', '--no-dna', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short.fa.sig') - assert not os.path.exists(outfile) - - -def test_do_sourmash_compute_multik_protein_bad_ksize(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '20,32', - '--protein', '--no-dna', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short.fa.sig') - assert not os.path.exists(outfile) - assert 'protein ksizes must be divisible by 3' in err - - @utils.in_tempdir -def test_do_sourmash_compute_multik_only_protein(c): - # check sourmash compute with only protein, no nucl - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('compute', '-k', '21,30', - '--protein', '--no-dna', testdata1) - outfile = os.path.join(c.location, 'short.fa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_multik_protein_input_non_div3_ksize(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short-protein.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '20,32', - '--protein', '--no-dna', - '--input-is-protein', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short-protein.fa.sig') - assert os.path.exists(outfile) - - -@utils.in_tempdir -def test_do_sourmash_compute_multik_only_protein_no_rna(c): - # test --no-rna as well (otherwise identical to previous test) - testdata1 = utils.get_test_data('short.fa') - - c.run_sourmash('compute', '-k', '21,30', - '--protein', '--no-rna', testdata1) - outfile = os.path.join(c.location, 'short.fa.sig') - assert 
os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_protein_bad_sequences(): - """Proper error handling when Ns in dna sequence""" - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.bad.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--protein', '--no-dna', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.bad.fa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_multik_input_is_protein(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--input-is-protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'ecoli.faa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - moltype = set([ x.minhash.is_molecule_type('protein') - for x in siglist ]) - assert len(moltype) == 1 - assert True in moltype - - -def test_do_sourmash_compute_multik_outfile(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 31 in ksizes - - -def test_do_sourmash_compute_with_scaled_1(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - max_hashes = [ x.minhash.max_hash for x in siglist ] - assert len(max_hashes) == 2 - assert set(max_hashes) == { sourmash.MAX_HASH } - - -def test_do_sourmash_compute_with_scaled_2(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '2', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - max_hashes = [ x.minhash.max_hash for x in siglist ] - assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /2.) 
]) - - -def test_do_sourmash_compute_with_scaled(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '100', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - max_hashes = [ x.minhash.max_hash for x in siglist ] - assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /100.) ]) - - -def test_do_sourmash_compute_with_bad_scaled(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '-1', - testdata1, '-o', outfile], - in_directory=location, - fail_ok=True) - - assert status != 0 - assert '--scaled value must be >= 1' in err - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1000.5', - testdata1, '-o', outfile], - in_directory=location, - fail_ok=True) - - assert status != 0 - assert '--scaled value must be integer value' in err - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1e9', - testdata1, '-o', outfile], - in_directory=location) - - assert status == 0 - assert 'WARNING: scaled value is nonsensical!?' in err - - -def test_do_sourmash_compute_with_seed(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--seed', '43', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - seeds = [ x.minhash.seed for x in siglist ] - assert len(seeds) == 2 - assert set(seeds) == set([ 43 ]) - - -def test_do_sourmash_check_protein_comparisons(): - # this test checks 2 x 2 protein comparisons with E. coli genes. - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--input-is-protein', - '--singleton', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.faa.sig') - assert os.path.exists(sig1) - - testdata2 = utils.get_test_data('ecoli.genes.fna') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--protein', '--no-dna', - '--singleton', - testdata2], - in_directory=location) - sig2 = os.path.join(location, 'ecoli.genes.fna.sig') - assert os.path.exists(sig2) - - # I'm not sure why load_signatures is randomizing order, but ok. 
- x = list(signature.load_signatures(sig1)) - sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name()) - - x = list(signature.load_signatures(sig2)) - sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name()) - - name1 = sig1_aa.name().split()[0] - assert name1 == 'NP_414543.1' - name2 = sig2_aa.name().split()[0] - assert name2 == 'NP_414544.1' - name3 = sig1_trans.name().split()[0] - assert name3 == 'gi|556503834:2801-3733' - name4 = sig2_trans.name().split()[0] - assert name4 == 'gi|556503834:337-2799' - - print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3)) - print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3)) - print(name1, name4, round(sig1_aa.similarity(sig2_trans), 3)) - print(name2, name4, round(sig2_aa.similarity(sig2_trans), 3)) - - assert round(sig1_aa.similarity(sig1_trans), 3) == 0.0 - assert round(sig2_aa.similarity(sig1_trans), 3) == 0.166 - assert round(sig1_aa.similarity(sig2_trans), 3) == 0.174 - assert round(sig2_aa.similarity(sig2_trans), 3) == 0.0 - - -@utils.in_tempdir -def test_do_sourmash_check_knowngood_dna_comparisons(c): - # this test checks against a known good signature calculated - # by utils/compute-dna-mh-another-way.py - testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('compute', '-k', '21', - '--singleton', '--dna', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') - assert os.path.exists(sig1) - - x = list(signature.load_signatures(sig1)) - sig1, sig2 = sorted(x, key=lambda x: x.name()) - - knowngood = utils.get_test_data('benchmark.dna.sig') - good = list(signature.load_signatures(knowngood))[0] - - assert sig2.similarity(good) == 1.0 - - -@utils.in_tempdir -def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c): - # check the --rna flag; otherwise identical to previous test. 
- testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('compute', '-k', '21', '--singleton', '--rna', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') - assert os.path.exists(sig1) - - x = list(signature.load_signatures(sig1)) - sig1, sig2 = sorted(x, key=lambda x: x.name()) - - knowngood = utils.get_test_data('benchmark.dna.sig') - good = list(signature.load_signatures(knowngood))[0] - - assert sig2.similarity(good) == 1.0 - - -def test_do_sourmash_check_knowngood_input_protein_comparisons(): - # this test checks against a known good signature calculated - # by utils/compute-input-prot-another-way.py - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--input-is-protein', - '--singleton', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.faa.sig') - assert os.path.exists(sig1) - - x = list(signature.load_signatures(sig1)) - sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name()) - - knowngood = utils.get_test_data('benchmark.input_prot.sig') - good_aa = list(signature.load_signatures(knowngood))[0] - - assert sig1_aa.similarity(good_aa) == 1.0 +def test_do_serial_compare(c): + # try doing a compare serial + import numpy + testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = glob.glob(testsigs) + c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs) -def test_do_sourmash_check_knowngood_protein_comparisons(): - # this test checks against a known good signature calculated - # by utils/compute-prot-mh-another-way.py - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.genes.fna') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--singleton', '--protein', - '--no-dna', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.genes.fna.sig') - assert os.path.exists(sig1) + cmp_outfile = c.output('cmp') + assert os.path.exists(cmp_outfile) + cmp_out = numpy.load(cmp_outfile.encode('utf-8')) - x = list(signature.load_signatures(sig1)) - sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name()) + sigs = [] + for fn in testsigs: + sigs.append(sourmash.load_one_signature(fn, ksize=21, + select_moltype='dna')) - knowngood = utils.get_test_data('benchmark.prot.sig') - good_trans = list(signature.load_signatures(knowngood))[0] + cmp_calc = numpy.zeros([len(sigs), len(sigs)]) + for i, si in enumerate(sigs): + for j, sj in enumerate(sigs): + cmp_calc[i][j] = si.similarity(sj) - assert sig2_trans.similarity(good_trans) == 1.0 + sigs = [] + for fn in testsigs: + sigs.append(sourmash.load_one_signature(fn, ksize=21, + select_moltype='dna')) + assert (cmp_out == cmp_calc).all() @utils.in_tempdir -def test_do_basic_compare(c): - # try doing a basic compare +def test_do_compare_parallel(c): + # try doing a compare parallel import numpy testsigs = utils.get_test_data('genome-s1*.sig') testsigs = glob.glob(testsigs) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs) + c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', + "--processes", "2", *testsigs) cmp_outfile = c.output('cmp') assert os.path.exists(cmp_outfile) @@ -762,7 +163,6 @@ def test_do_compare_quiet(): def test_do_traverse_directory_compare(): - import numpy with utils.TempDirectory() as location: status, out, err = utils.runscript('sourmash', ['compare', '--traverse-directory', @@ -866,6 +266,39 @@ def test_do_compare_output_multiple_moltype(): assert 
'multiple molecule types loaded;' in err
 
 
+def test_do_compare_dayhoff():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21', '--dayhoff',
+                                            '--no-dna', testdata1],
+                                           in_directory=location)
+        assert status == 0
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21', '--dayhoff',
+                                            '--no-dna', testdata2],
+                                           in_directory=location)
+        assert status == 0
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compare', 'short.fa.sig',
+                                            'short2.fa.sig', '--dayhoff',
+                                            '--csv', 'xxx'],
+                                           in_directory=location)
+        true_out = '''[1. 0.94]
+[0.94 1. ]
+min similarity in matrix: 0.940'''.splitlines()
+        # every expected line should appear in the cleaned output
+        cleaned_lines = [line.split('...')[-1].strip()
+                         for line in out.splitlines()]
+        for expected in true_out:
+            assert expected in cleaned_lines
+        assert status == 0
+
+
 def test_do_plot_comparison():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('short.fa')
@@ -930,6 +363,31 @@ def test_do_plot_comparison_3():
     assert os.path.exists(os.path.join(location, "cmp.matrix.png"))
 
 
+def test_do_plot_comparison_4_output_dir():
+    with utils.TempDirectory() as location:
+        output_dir = os.path.join(location, 'xyz_test')
+
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', testdata1, testdata2],
+                                           in_directory=location)
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compare', 'short.fa.sig',
+                                            'short2.fa.sig', '-o', 'cmp'],
+                                           in_directory=location)
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['plot', 'cmp', '--labels',
+                                            '--output-dir', output_dir],
+                                           in_directory=location)
+
+        assert os.path.exists(os.path.join(output_dir, "cmp.dendro.png"))
+        assert os.path.exists(os.path.join(output_dir, "cmp.matrix.png"))
+
+
 def test_do_plot_comparison_5_force():
     import numpy
     with utils.TempDirectory() as location:
@@ -3282,6 +2740,20 @@ def test_gather_abund_10_1_ignore_abundance():
     assert 'genome-s12.fa.gz' not in out
 
 
+@utils.in_tempdir
+def test_gather_output_unassigned_with_abundance(c):
+    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
+    against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig')
+
+    c.run_sourmash('gather', query, against, '--output-unassigned',
+                   c.output('unassigned.sig'))
+
+    assert os.path.exists(c.output('unassigned.sig'))
+
+    ss = sourmash.load_one_signature(c.output('unassigned.sig'))
+    assert ss.minhash.track_abundance
+
+
 def test_sbt_categorize():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py
new file mode 100644
index 0000000000..a3dea02bd0
--- /dev/null
+++ b/tests/test_sourmash_compute.py
@@ -0,0 +1,772 @@
+"""
+Tests for sourmash compute command-line functionality.
+"""
+from __future__ import print_function, unicode_literals
+import os
+import gzip
+import shutil
+import screed
+import glob
+import json
+import csv
+import pytest
+
+from .
+from . import sourmash_tst_utils as utils
+import sourmash
+from sourmash import MinHash
+from sourmash.sbt import SBT, Node
+from sourmash.sbtmh import SigLeaf, load_sbt_index
+
+from sourmash import signature
+from sourmash import VERSION
+
+
+def test_do_sourmash_compute():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name().endswith('short.fa')
+
+
+def test_do_sourmash_compute_output_valid_file():
+    """ Trigger bug #123 """
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        testdata3 = utils.get_test_data('short3.fa')
+        sigfile = os.path.join(location, 'short.fa.sig')
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '-o', sigfile,
+                                            testdata1,
+                                            testdata2, testdata3],
+                                           in_directory=location)
+
+        assert os.path.exists(sigfile)
+        assert not out # stdout should be empty
+
+        # is it valid json?
+        with open(sigfile, 'r') as f:
+            data = json.load(f)
+
+        filesigs = [sig['filename'] for sig in data]
+        assert all(testdata in filesigs
+                   for testdata in (testdata1, testdata2, testdata3))
+
+
+def test_do_sourmash_compute_output_stdout_valid():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        testdata3 = utils.get_test_data('short3.fa')
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '-o', '-',
+                                            testdata1,
+                                            testdata2, testdata3],
+                                           in_directory=location)
+
+        # is it valid json?
+        data = json.loads(out)
+
+        filesigs = [sig['filename'] for sig in data]
+        assert all(testdata in filesigs
+                   for testdata in (testdata1, testdata2, testdata3))
+
+
+def test_do_sourmash_compute_output_and_name_valid_file():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        testdata3 = utils.get_test_data('short3.fa')
+        sigfile = os.path.join(location, 'short.fa.sig')
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '-o', sigfile,
+                                            '--merge', '"name"',
+                                            testdata1,
+                                            testdata2, testdata3],
+                                           in_directory=location)
+
+        assert os.path.exists(sigfile)
+        assert 'calculated 1 signatures for 4 sequences taken from 3 files' in err
+
+        # is it valid json?
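+        # (--merge should collapse all three inputs into a single record)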
+        with open(sigfile, 'r') as f:
+            data = json.load(f)
+
+        assert len(data) == 1
+
+        all_testdata = " ".join([testdata1, testdata2, testdata3])
+        sigfile_merged = os.path.join(location, 'short.all.fa.sig')
+        cmd = "cat {} | sourmash compute -k 31 -o {} -".format(
+                all_testdata, sigfile_merged)
+        status, out, err = utils.run_shell_cmd(cmd, in_directory=location)
+
+        with open(sigfile_merged, 'r') as f:
+            data_merged = json.load(f)
+
+        assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins']
+
+
+def test_do_sourmash_compute_singleton():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--singleton',
+                                            testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name().endswith('shortName')
+
+
+def test_do_sourmash_compute_10x():
+    pytest.importorskip('pysam')
+    pytest.importorskip('pathos')
+
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+        barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--line-count', '50',
+                                            '--input-is-10x',
+                                            '--protein',
+                                            '--barcodes-file',
+                                            barcodes_file,
+                                            testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'possorted_genome_bam.bam.sig')
+        assert os.path.exists(sigfile)
+        siglist = list(signature.load_signatures(sigfile))
+        assert len(siglist) == 16
+        barcode_signatures = list(set([sig.name() for sig in siglist]))
+
+        with open(utils.get_test_data('10x-example/barcodes.tsv')) as f:
+            true_barcodes = set(x.strip() for x in f.readlines())
+
+        # Ensure that every signature's barcode is present in barcodes.tsv
+        assert all(bc in true_barcodes for bc in barcode_signatures)
+        min_hashes = [x.minhash.get_mins() for x in siglist]
+        assert all(mins != [] for mins in min_hashes)
+
+        # Filtered bam file with no barcodes file
+        # should run sourmash compute successfully
+        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31',
+                                            '--dna',
+                                            '--input-is-10x',
+                                            testdata1,
+                                            '-o', '10x-example_dna.sig'],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, '10x-example_dna.sig')
+        assert os.path.exists(sigfile)
+        siglist = list(signature.load_signatures(sigfile))
+        assert len(siglist) == 32
+        min_hashes = [x.minhash.get_mins() for x in siglist]
+        assert all(mins != [] for mins in min_hashes)
+
+        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+        csv_path = os.path.join(location, "all_barcodes_meta.csv")
+        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
+        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31',
+                                            '--dna', '--count-valid-reads', '10',
+                                            '--input-is-10x',
+                                            testdata1,
+                                            '--write-barcode-meta-csv', csv_path,
+                                            '--barcodes', barcodes_path,
+                                            '--rename-10x-barcodes', renamer_path,
+                                            '--save-fastas', location,
+                                            '-o', '10x-example_dna.sig'],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, '10x-example_dna.sig')
+        assert os.path.exists(sigfile)
+        siglist = list(signature.load_signatures(sigfile))
+        assert len(siglist) == 1
+        min_hashes = [x.minhash.get_mins() for x in siglist]
+        assert all(mins != [] for mins in min_hashes)
+
+        with open(csv_path, 'r') as f:
+            data = [line.split() for line in f]
+        assert len(data) == 9
+        barcodes = [filename.replace(".fasta", "") for filename in os.listdir(location) if filename.endswith('.fasta')]
+        assert len(barcodes) == 1
+        assert barcodes[0] == 'lung_epithelial_cell|AAATGCCCAAACTGCT-1'
+
+
+def test_do_sourmash_compute_name():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--merge', 'foo',
+                                            testdata1, '-o', 'foo.sig'],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'foo.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name() == 'foo'
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name', 'foo',
+                                            testdata1, '-o', 'foo2.sig'],
+                                           in_directory=location)
+
+        sigfile2 = os.path.join(location, 'foo2.sig')
+        assert os.path.exists(sigfile2)
+
+        sig2 = next(signature.load_signatures(sigfile2))
+        assert sig2.name() == 'foo'
+        assert sig.name() == sig2.name()
+
+
+def test_do_sourmash_compute_name_fail_no_output():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name', 'foo',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        assert status == -1
+
+
+def test_do_sourmash_compute_merge_fail_no_output():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--merge', 'foo',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        assert status == -1
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name', 'foo',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        assert status == -1
+
+
+def test_do_sourmash_compute_name_from_first():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short3.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name-from-first',
+                                            testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'short3.fa.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name() == 'firstname'
+
+
+def test_do_sourmash_compute_multik():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 31 in ksizes
+
+
+def test_do_sourmash_compute_multik_with_protein():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--protein',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 4
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+
+
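+# The --dayhoff variants below mirror the --protein multi-k tests above.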
+def test_do_sourmash_compute_multik_with_dayhoff():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--dayhoff', '--no-dna',
+                                            testdata1],
+                                           in_directory=location)
+        assert 'Computing only Dayhoff-encoded protein (and not nucleotide) ' \
+               'signatures.' in err
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+            assert all(x.minhash.dayhoff for x in siglist)
+
+
+def test_do_sourmash_compute_multik_with_dayhoff_and_dna():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--dayhoff',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 4
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+            assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2
+            assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2
+
+
+def test_do_sourmash_compute_multik_with_dayhoff_dna_protein():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--dayhoff', '--protein',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 6
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+            assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2
+            assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2
+            assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2
+
+
+def test_do_sourmash_compute_multik_with_nothing():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--no-protein', '--no-dna',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert not os.path.exists(outfile)
+
+
+def test_do_sourmash_compute_multik_protein_bad_ksize():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '20,32',
+                                            '--protein', '--no-dna',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert not os.path.exists(outfile)
+        assert 'protein ksizes must be divisible by 3' in err
+
+
+@utils.in_tempdir
+def test_do_sourmash_compute_multik_only_protein(c):
+    # check sourmash compute with only protein, no nucl
+    testdata1 = utils.get_test_data('short.fa')
+    c.run_sourmash('compute', '-k', '21,30',
+                   '--protein', '--no-dna', testdata1)
+    outfile = os.path.join(c.location, 'short.fa.sig')
+    assert os.path.exists(outfile)
+
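+    # expect one signature per ksize, both protein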
+    with open(outfile, 'rt') as fp:
+        sigdata = fp.read()
+        siglist = list(signature.load_signatures(sigdata))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 30 in ksizes
+
+
+def test_do_sourmash_compute_multik_protein_input_non_div3_ksize():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short-protein.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '20,32',
+                                            '--protein', '--no-dna',
+                                            '--input-is-protein',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        outfile = os.path.join(location, 'short-protein.fa.sig')
+        # with --input-is-protein, ksizes need not be a multiple of 3
+        assert os.path.exists(outfile)
+
+
+@utils.in_tempdir
+def test_do_sourmash_compute_multik_only_protein_no_rna(c):
+    # test --no-rna as well (otherwise identical to previous test)
+    testdata1 = utils.get_test_data('short.fa')
+
+    c.run_sourmash('compute', '-k', '21,30',
+                   '--protein', '--no-rna', testdata1)
+    outfile = os.path.join(c.location, 'short.fa.sig')
+    assert os.path.exists(outfile)
+
+    with open(outfile, 'rt') as fp:
+        sigdata = fp.read()
+        siglist = list(signature.load_signatures(sigdata))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 30 in ksizes
+
+
+def test_do_sourmash_compute_protein_bad_sequences():
+    """Handle Ns in the DNA sequence gracefully when computing protein sigs."""
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.bad.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--protein', '--no-dna',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.bad.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+
+
+def test_do_sourmash_compute_multik_input_is_protein():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.faa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--input-is-protein',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'ecoli.faa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+
+            moltype = set([ x.minhash.is_molecule_type('protein')
+                            for x in siglist ])
+            assert len(moltype) == 1
+            assert True in moltype
+
+
+def test_do_sourmash_compute_multik_outfile():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 31 in ksizes
+
+
+def test_do_sourmash_compute_with_scaled_1():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '1',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
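+        # with --scaled 1, every hash is kept, so max_hash == MAX_HASH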
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        max_hashes = [ x.minhash.max_hash for x in siglist ]
+        assert len(max_hashes) == 2
+        assert set(max_hashes) == { sourmash.MAX_HASH }
+
+
+def test_do_sourmash_compute_with_scaled_2():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '2',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        max_hashes = [ x.minhash.max_hash for x in siglist ]
+        assert len(max_hashes) == 2
+        assert set(max_hashes) == set([ int(2**64 / 2.) ])
+
+
+def test_do_sourmash_compute_with_scaled():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '100',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        max_hashes = [ x.minhash.max_hash for x in siglist ]
+        assert len(max_hashes) == 2
+        assert set(max_hashes) == set([ int(2**64 / 100.) ])
+
+
+def test_do_sourmash_compute_with_bad_scaled():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '-1',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location,
+                                           fail_ok=True)
+
+        assert status != 0
+        assert '--scaled value must be >= 1' in err
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '1000.5',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location,
+                                           fail_ok=True)
+
+        assert status != 0
+        assert '--scaled value must be integer value' in err
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '1e9',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+
+        assert status == 0
+        assert 'WARNING: scaled value is nonsensical!?' in err
+
+
+def test_do_sourmash_compute_with_seed():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--seed', '43',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        seeds = [ x.minhash.seed for x in siglist ]
+        assert len(seeds) == 2
+        assert set(seeds) == set([ 43 ])
+
+
+def test_do_sourmash_check_protein_comparisons():
+    # this test checks 2 x 2 protein comparisons with E. coli genes.
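+    # (amino acid signatures from ecoli.faa vs signatures translated from
+    # the corresponding gene sequences in ecoli.genes.fna)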
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.faa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--input-is-protein',
+                                            '--singleton',
+                                            testdata1],
+                                           in_directory=location)
+        sig1 = os.path.join(location, 'ecoli.faa.sig')
+        assert os.path.exists(sig1)
+
+        testdata2 = utils.get_test_data('ecoli.genes.fna')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--protein', '--no-dna',
+                                            '--singleton',
+                                            testdata2],
+                                           in_directory=location)
+        sig2 = os.path.join(location, 'ecoli.genes.fna.sig')
+        assert os.path.exists(sig2)
+
+        # I'm not sure why load_signatures is randomizing order, but ok.
+        x = list(signature.load_signatures(sig1))
+        sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name())
+
+        x = list(signature.load_signatures(sig2))
+        sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name())
+
+        name1 = sig1_aa.name().split()[0]
+        assert name1 == 'NP_414543.1'
+        name2 = sig2_aa.name().split()[0]
+        assert name2 == 'NP_414544.1'
+        name3 = sig1_trans.name().split()[0]
+        assert name3 == 'gi|556503834:2801-3733'
+        name4 = sig2_trans.name().split()[0]
+        assert name4 == 'gi|556503834:337-2799'
+
+        print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3))
+        print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3))
+        print(name1, name4, round(sig1_aa.similarity(sig2_trans), 3))
+        print(name2, name4, round(sig2_aa.similarity(sig2_trans), 3))
+
+        assert round(sig1_aa.similarity(sig1_trans), 3) == 0.0
+        assert round(sig2_aa.similarity(sig1_trans), 3) == 0.166
+        assert round(sig1_aa.similarity(sig2_trans), 3) == 0.174
+        assert round(sig2_aa.similarity(sig2_trans), 3) == 0.0
+
+
+@utils.in_tempdir
+def test_do_sourmash_check_knowngood_dna_comparisons(c):
+    # this test checks against a known good signature calculated
+    # by utils/compute-dna-mh-another-way.py
+    testdata1 = utils.get_test_data('ecoli.genes.fna')
+    c.run_sourmash('compute', '-k', '21',
+                   '--singleton', '--dna',
+                   testdata1)
+    sig1 = c.output('ecoli.genes.fna.sig')
+    assert os.path.exists(sig1)
+
+    x = list(signature.load_signatures(sig1))
+    sig1, sig2 = sorted(x, key=lambda x: x.name())
+
+    knowngood = utils.get_test_data('benchmark.dna.sig')
+    good = list(signature.load_signatures(knowngood))[0]
+
+    assert sig2.similarity(good) == 1.0
+
+
+@utils.in_tempdir
+def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c):
+    # check the --rna flag; otherwise identical to previous test.
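+    # (--rna is accepted as a synonym for --dna, so the result should still
+    # match the known-good DNA signature)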
+    testdata1 = utils.get_test_data('ecoli.genes.fna')
+    c.run_sourmash('compute', '-k', '21', '--singleton', '--rna',
+                   testdata1)
+    sig1 = c.output('ecoli.genes.fna.sig')
+    assert os.path.exists(sig1)
+
+    x = list(signature.load_signatures(sig1))
+    sig1, sig2 = sorted(x, key=lambda x: x.name())
+
+    knowngood = utils.get_test_data('benchmark.dna.sig')
+    good = list(signature.load_signatures(knowngood))[0]
+
+    assert sig2.similarity(good) == 1.0
+
+
+def test_do_sourmash_check_knowngood_input_protein_comparisons():
+    # this test checks against a known good signature calculated
+    # by utils/compute-input-prot-another-way.py
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.faa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--input-is-protein',
+                                            '--singleton',
+                                            testdata1],
+                                           in_directory=location)
+        sig1 = os.path.join(location, 'ecoli.faa.sig')
+        assert os.path.exists(sig1)
+
+        x = list(signature.load_signatures(sig1))
+        sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name())
+
+        knowngood = utils.get_test_data('benchmark.input_prot.sig')
+        good_aa = list(signature.load_signatures(knowngood))[0]
+
+        assert sig1_aa.similarity(good_aa) == 1.0
+
+
+def test_do_sourmash_check_knowngood_protein_comparisons():
+    # this test checks against a known good signature calculated
+    # by utils/compute-prot-mh-another-way.py
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.genes.fna')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--singleton', '--protein',
+                                            '--no-dna',
+                                            testdata1],
+                                           in_directory=location)
+        sig1 = os.path.join(location, 'ecoli.genes.fna.sig')
+        assert os.path.exists(sig1)
+
+        x = list(signature.load_signatures(sig1))
+        sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name())
+
+        knowngood = utils.get_test_data('benchmark.prot.sig')
+        good_trans = list(signature.load_signatures(knowngood))[0]
+
+        assert sig2_trans.similarity(good_trans) == 1.0
diff --git a/tests/test_tenx.py b/tests/test_tenx.py
new file mode 100644
index 0000000000..b585303ab8
--- /dev/null
+++ b/tests/test_tenx.py
@@ -0,0 +1,128 @@
+import pytest
+pytest.importorskip('pysam')
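+# pysam is an optional dependency; importorskip skips this whole test module
+# when it is not installed.
+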
+from . import sourmash_tst_utils as utils
+import sourmash.tenx as sourmash_tenx
+import pysam as bs
+
+
+def test_read_barcodes_file():
+    filename = utils.get_test_data('10x-example/barcodes.tsv')
+    barcodes = sourmash_tenx.read_barcodes_file(filename)
+    assert len(barcodes) == 10
+
+
+def test_read_bam_file():
+    filename = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+    bam_file = sourmash_tenx.read_bam_file(filename)
+    assert isinstance(bam_file, bs.AlignmentFile)
+    total_alignments = sum(1 for _ in bam_file)
+    assert total_alignments == 1714
+
+
+def test_shard_bam_file():
+    filename = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+    bam_file = sourmash_tenx.read_bam_file(filename)
+    assert isinstance(bam_file, bs.AlignmentFile)
+
+    expected_alignments = sum(1 for _ in bam_file)
+    with utils.TempDirectory() as location:
+        bam_shard_files = sourmash_tenx.shard_bam_file(
+            filename, expected_alignments, location)
+        assert len(bam_shard_files) == 1
+
+    num_shards = 2
+    with utils.TempDirectory() as location:
+        bam_shard_files = sourmash_tenx.shard_bam_file(
+            filename, expected_alignments // num_shards, location)
+        assert len(bam_shard_files) == 2
+
+        total_alignments = 0
+        for bam_file in bam_shard_files:
+            total_alignments += sum(1 for _ in sourmash_tenx.read_bam_file(bam_file))
+        assert total_alignments == expected_alignments
+
+        whole_bam_file = sourmash_tenx.read_bam_file(filename)
+        for bam_file in bam_shard_files:
+            for line in sourmash_tenx.read_bam_file(bam_file):
+                assert line == next(whole_bam_file)
+
+
+def test_pass_alignment_qc():
+    barcodes = sourmash_tenx.read_barcodes_file(
+        utils.get_test_data('10x-example/barcodes.tsv'))
+    bam = sourmash_tenx.read_bam_file(
+        utils.get_test_data('10x-example/possorted_genome_bam.bam'))
+
+    total_pass = sum(1 for alignment in bam if
+                     sourmash_tenx.pass_alignment_qc(alignment, barcodes))
+    assert total_pass == 439
+
+
+def test_pass_alignment_qc_filtered():
+    bam = sourmash_tenx.read_bam_file(
+        utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam'))
+    total_alignments = sum(1 for _ in bam)
+    bam = sourmash_tenx.read_bam_file(
+        utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam'))
+    assert total_alignments == 1500
+    total_pass = sum(1 for alignment in bam if
+                     sourmash_tenx.pass_alignment_qc(alignment, None))
+    assert total_pass == 192
+
+
+def test_parse_barcode_renamer():
+    filename = utils.get_test_data('10x-example/barcodes.tsv')
+    barcodes = sourmash_tenx.read_barcodes_file(filename)
+    renamer = sourmash_tenx.parse_barcode_renamer(barcodes, None)
+    for key, value in renamer.items():
+        assert key == value
+    assert len(renamer) == len(barcodes)
+
+    renamer = sourmash_tenx.parse_barcode_renamer(
+        barcodes, utils.get_test_data('10x-example/barcodes_renamer.tsv'))
+    for key, value in renamer.items():
+        assert key in value
+        assert "epithelial_cell" in value
+    assert len(renamer) == len(barcodes)
+
+
+def test_bam_to_fasta():
+    filename = utils.get_test_data('10x-example/barcodes.tsv')
+    bam_file = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+    barcodes = sourmash_tenx.read_barcodes_file(filename)
+
+    fastas = sourmash_tenx.bam_to_fasta(
+        barcodes=barcodes, barcode_renamer=None, delimiter="X", umi_filter=False, bam_file=bam_file)
+    assert len(list(fastas)) == 8
+
+
+def test_filtered_bam_to_fasta():
+    bam_file = utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam')
+    fastas = sourmash_tenx.bam_to_fasta(
+        barcodes=None, barcode_renamer=None, delimiter='X',
+        umi_filter=False, bam_file=bam_file)
+    assert len(list(fastas)) == 32
+
+
+def test_filtered_bam_to_umi_fasta():
+    bam_file = utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam')
+    fastas = sourmash_tenx.bam_to_fasta(
+        barcodes=None, barcode_renamer=None, delimiter='X', umi_filter=True, bam_file=bam_file)
+    assert len(list(fastas)) == 32
+
+
+def test_write_sequences():
+    cell_sequences = {'AAATGCCCAAACTGCT-1': "atgc", 'AAATGCCCAAAGTGCT-1': "gtga"}
+    fastas = list(sourmash_tenx.write_cell_sequences(cell_sequences))
+    assert len(fastas) == len(cell_sequences)
+    for fasta in fastas:
+        assert fasta.endswith(".fasta")
+
+
+def test_write_sequences_umi():
+    cell_sequences = {'AAATGCCCAXAACTGCT-1': "atgc", 'AAATGCCXCAAAGTGCT-1': "gtga", 'AAATGCCXCAAAGTGCT-2': "gtgc"}
+    fastas = list(sourmash_tenx.write_cell_sequences(cell_sequences, True))
+    assert len(fastas) == len(cell_sequences)
+    for fasta in fastas:
+        assert fasta.endswith(".fasta")
+
diff --git a/tox.ini b/tox.ini
index 5afa1e9c42..93ffa78cc1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,7 +9,7 @@ deps=
     codecov
     ipfshttpclient
     redis
-    bamnostic
+    pysam
     pathos
 commands=
     pip install -r requirements.txt