diff --git a/.gitignore b/.gitignore index b763147ae1..db06c2c4ae 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ sourmash/_minhash.cpp .pytest_cache .python-version sourmash/version.py +*.DS_Store \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 72aafe064e..49e9b80133 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,10 +45,10 @@ needletail = { version = "~0.2.1", optional = true } serde = "1.0" serde_derive = "~1.0.58" serde_json = "1.0.2" -ukhs = "0.3.4" +ukhs = { git = "https://github.com/luizirber/ukhs", branch = "feature/alternative_backends", features = ["boomphf_mphf"], default-features = false} bio = { git = "https://github.com/luizirber/rust-bio", branch = "feature/fastx_reader" } primal = "0.2.3" -pdatastructs = "0.5.0" +pdatastructs = { git = "https://github.com/luizirber/pdatastructs.rs", branch = "succinct_wasm" } itertools = "0.8.0" typed-builder = "0.3.0" csv = "1.0.7" diff --git a/Makefile b/Makefile index c51b7ffcd0..079c6e2213 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ doc: .PHONY coverage: all $(PYTHON) setup.py clean --all SOURMASH_COVERAGE=1 $(PYTHON) setup.py build_ext -i - $(PYTHON) -m pytest --cov=. + $(PYTHON) -m pytest --cov=. --cov-report term-missing benchmark: all asv continuous master diff --git a/README.md b/README.md index 91fcb39bfc..e2d6619cb3 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ sourmash is a product of the We recommend using bioconda to install sourmash: ``` -conda install sourmash +conda install -c conda-forge -c bioconda sourmash ``` This will install the latest stable version of sourmash 2. diff --git a/doc/_static/cmp.matrix.png b/doc/_static/cmp.matrix.png index 931d452170..31791c3f9f 100644 Binary files a/doc/_static/cmp.matrix.png and b/doc/_static/cmp.matrix.png differ diff --git a/doc/command-line.md b/doc/command-line.md index 69658f60d7..1755ce60dc 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -20,12 +20,12 @@ taken. Grab three bacterial genomes from NCBI: ``` curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Escherichia_coli/reference/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz -curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Salmonella_enterica/reference/GCF_000006945.1_ASM694v1/GCF_000006945.1_ASM694v1_genomic.fna.gz +curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Salmonella_enterica/reference/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.fna.gz curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Sphingobacteriaceae_bacterium_DW12/latest_assembly_versions/GCF_000783305.1_ASM78330v1/GCF_000783305.1_ASM78330v1_genomic.fna.gz ``` Compute signatures for each: ``` - sourmash compute *.fna.gz + sourmash compute -k 31 *.fna.gz ``` This will produce three `.sig` files containing MinHash signatures at k=31. 
@@ -33,9 +33,16 @@ Next, compare all the signatures to each other: ``` sourmash compare *.sig -o cmp ``` + +Optionally, parallelize compare to 8 threads with `-p 8`: + +``` +sourmash compare -p 8 *.sig -o cmp +``` + Finally, plot a dendrogram: ``` -sourmash plot cmp +sourmash plot cmp --labels ``` This will output two files, `cmp.dendro.png` and `cmp.matrix.png`, containing a clustering & dendrogram of the sequences, as well as a diff --git a/doc/tutorial-basic.md b/doc/tutorial-basic.md index 8b2d200cae..8a4bf51eda 100644 --- a/doc/tutorial-basic.md +++ b/doc/tutorial-basic.md @@ -185,6 +185,12 @@ Compare all the things: sourmash compare ecoli_many_sigs/* -o ecoli_cmp ``` +Optionally, parallelize to 8 threads using `-p 8`: + +``` +sourmash compare -p 8 ecoli_many_sigs/* -o ecoli_cmp +``` + and then plot: ``` diff --git a/setup.py b/setup.py index 259da9b0de..e47182bfbd 100644 --- a/setup.py +++ b/setup.py @@ -69,10 +69,10 @@ 'setuptools_scm', 'setuptools_scm_git_archive'], "use_scm_version": {"write_to": "sourmash/version.py"}, "extras_require": { - 'test' : ['pytest', 'pytest-cov', 'numpy', 'matplotlib', 'scipy','recommonmark'], + 'test' : ['pytest', 'pytest-cov', 'numpy', 'matplotlib', 'scipy', 'recommonmark'], 'demo' : ['jupyter', 'jupyter_client', 'ipython'], 'doc' : ['sphinx'], - '10x': ['pathos', 'bamnostic>=0.9.2'], + '10x': ['pathos', 'pysam'] }, "include_package_data": True, "package_data": { diff --git a/sourmash/_minhash.pxd b/sourmash/_minhash.pxd index 1d6fe3a2dd..5b0aaf9e3f 100644 --- a/sourmash/_minhash.pxd +++ b/sourmash/_minhash.pxd @@ -25,15 +25,18 @@ cdef extern from "kmer_min_hash.hh": const unsigned int num; const unsigned int ksize; const bool is_protein; + const bool dayhoff; const HashIntoType max_hash; CMinHashType mins; - KmerMinHash(unsigned int, unsigned int, bool, uint32_t, HashIntoType) + KmerMinHash(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType) void add_hash(HashIntoType) except +ValueError void remove_hash(HashIntoType) except +ValueError void add_word(string word) except +ValueError void add_sequence(const char *, bool) except +ValueError void merge(const KmerMinHash&) except +ValueError + string aa_to_dayhoff(string aa) except +ValueError + string translate_codon(string codon) except +ValueError unsigned int count_common(const KmerMinHash&) except +ValueError unsigned long size() @@ -41,13 +44,15 @@ cdef extern from "kmer_min_hash.hh": cdef cppclass KmerMinAbundance(KmerMinHash): CMinHashType abunds; - KmerMinAbundance(unsigned int, unsigned int, bool, uint32_t, HashIntoType) + KmerMinAbundance(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType) void add_hash(HashIntoType) except +ValueError void remove_hash(HashIntoType) except +ValueError void add_word(string word) except +ValueError void add_sequence(const char *, bool) except +ValueError void merge(const KmerMinAbundance&) except +ValueError void merge(const KmerMinHash&) except +ValueError + string aa_to_dayhoff(string aa) except +ValueError + string translate_codon(string codon) except +ValueError unsigned int count_common(const KmerMinAbundance&) except +ValueError unsigned long size() diff --git a/sourmash/_minhash.pyx b/sourmash/_minhash.pyx index af9471c305..beaff29f1c 100644 --- a/sourmash/_minhash.pyx +++ b/sourmash/_minhash.pyx @@ -45,11 +45,15 @@ def get_scaled_for_max_hash(max_hash): cdef bytes to_bytes(s): - if not isinstance(s, (basestring, bytes)): + # Allow for strings, bytes or int + # Single item of byte string = int + if not isinstance(s, (basestring, 
bytes, int)): raise TypeError("Requires a string-like sequence") if isinstance(s, unicode): s = s.encode('utf-8') + if isinstance(s, int): + s = bytes([s]) return s @@ -88,6 +92,7 @@ cdef class MinHash(object): def __init__(self, unsigned int n, unsigned int ksize, bool is_protein=False, + bool dayhoff=False, bool track_abundance=False, uint32_t seed=MINHASH_DEFAULT_SEED, HashIntoType max_hash=0, @@ -107,9 +112,9 @@ cdef class MinHash(object): cdef KmerMinHash *mh = NULL if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash) else: - mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) @@ -122,7 +127,8 @@ cdef class MinHash(object): def __copy__(self): a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, deref(self._this).max_hash) a.merge(self) return a @@ -135,23 +141,24 @@ cdef class MinHash(object): return (deref(self._this).num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, self.get_mins(with_abundance=with_abundance), None, self.track_abundance, deref(self._this).max_hash, deref(self._this).seed) def __setstate__(self, tup): - (n, ksize, is_protein, mins, _, track_abundance, max_hash, seed) =\ + (n, ksize, is_protein, dayhoff, mins, _, track_abundance, max_hash, seed) =\ tup self.track_abundance = track_abundance cdef KmerMinHash *mh = NULL if track_abundance: - mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) self.set_abundances(mins) else: - mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash) + mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash) self._this.reset(mh) self.add_many(mins) @@ -160,6 +167,7 @@ cdef class MinHash(object): (deref(self._this).num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, self.track_abundance, deref(self._this).seed, deref(self._this).max_hash, @@ -173,7 +181,8 @@ cdef class MinHash(object): def copy_and_clear(self): a = MinHash(deref(self._this).num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, deref(self._this).max_hash) return a @@ -234,6 +243,10 @@ cdef class MinHash(object): def is_protein(self): return deref(self._this).is_protein + @property + def dayhoff(self): + return deref(self._this).dayhoff + @property def ksize(self): return deref(self._this).ksize @@ -247,6 +260,9 @@ cdef class MinHash(object): def add_hash(self, uint64_t h): deref(self._this).add_hash(h) + def translate_codon(self, codon): + return deref(self._this).translate_codon(to_bytes(codon)) + def count_common(self, MinHash other): return deref(self._this).count_common(deref(other._this)) @@ -255,7 +271,8 @@ cdef class MinHash(object): raise ValueError('new sample n is higher than current sample n') a = MinHash(new_num, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, 0) if self.track_abundance: 
a.set_abundances(self.get_mins(with_abundance=True)) @@ -286,7 +303,8 @@ cdef class MinHash(object): new_max_hash = get_max_hash_for_scaled(new_num) a = MinHash(0, deref(self._this).ksize, - deref(self._this).is_protein, self.track_abundance, + deref(self._this).is_protein, deref(self._this).dayhoff, + self.track_abundance, deref(self._this).seed, new_max_hash) if self.track_abundance: a.set_abundances(self.get_mins(with_abundance=True)) @@ -307,6 +325,7 @@ cdef class MinHash(object): combined_mh = new KmerMinAbundance(num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, deref(self._this).seed, deref(self._this).max_hash) @@ -314,6 +333,7 @@ cdef class MinHash(object): combined_mh = new KmerMinHash(num, deref(self._this).ksize, deref(self._this).is_protein, + deref(self._this).dayhoff, deref(self._this).seed, deref(self._this).max_hash) @@ -424,12 +444,27 @@ cdef class MinHash(object): if not deref(self._this).is_protein: raise ValueError("cannot add amino acid sequence to DNA MinHash!") - for i in range(0, len(sequence) - ksize + 1): - deref(self._this).add_word(to_bytes(sequence[i:i + ksize])) + aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1)) + if not self.dayhoff: + for aa_kmer in aa_kmers: + deref(self._this).add_word(to_bytes(aa_kmer)) + else: + for aa_kmer in aa_kmers: + dayhoff_kmer = '' + for aa in aa_kmer: + dayhoff_letter = deref(self._this).aa_to_dayhoff(to_bytes(aa)) + dayhoff_kmer += dayhoff_letter + # dayhoff_kmer = ''.join( for aa in aa_kmer) + deref(self._this).add_word(to_bytes(dayhoff_kmer)) def is_molecule_type(self, molecule): if molecule.upper() == 'DNA' and not self.is_protein: return True - if molecule == 'protein' and self.is_protein: - return True + if self.is_protein: + if self.dayhoff: + if molecule == 'dayhoff': + return True + else: + if molecule == 'protein': + return True return False diff --git a/sourmash/command_compute.py b/sourmash/command_compute.py new file mode 100644 index 0000000000..97399bf0ee --- /dev/null +++ b/sourmash/command_compute.py @@ -0,0 +1,618 @@ +""" +Functions implementing the 'compute' command and related functions. +""" +from __future__ import print_function, division, absolute_import + +import argparse +import glob +import os +import os.path +import sys +import random +import itertools +import time +import collections +from collections import defaultdict +from functools import partial +import numpy as np + +import screed +from .sourmash_args import SourmashArgumentParser +from . import DEFAULT_SEED, MinHash +from . import signature as sig +from . import signature_json +from . import sourmash_args +from . import np_utils +from .logging import notify, error, set_quiet, debug + +DEFAULT_COMPUTE_K = '21,31,51' +from .commands import DEFAULT_N +DEFAULT_LINE_COUNT = 1500 + + +def compute(args): + """Compute the signature for one or more files. + + Use cases: + sourmash compute multiseq.fa => multiseq.fa.sig, etc. + sourmash compute genome.fa --singleton => genome.fa.sig + sourmash compute file1.fa file2.fa -o file.sig + => creates one output file file.sig, with one signature for each + input file. + sourmash compute file1.fa file2.fa --merge merged -o file.sig + => creates one output file file.sig, with all sequences from + file1.fa and file2.fa combined into one signature. 
+    """
+    parser = SourmashArgumentParser()
+    parser.add_argument('filenames', nargs='+',
+                        help='file(s) of sequences')
+
+    sourmash_args.add_construct_moltype_args(parser)
+
+    parser.add_argument('-q', '--quiet', action='store_true',
+                        help='suppress non-error output')
+    parser.add_argument('--input-is-protein', action='store_true',
+                        help='Consume protein sequences - no translation needed.')
+    parser.add_argument('-k', '--ksizes',
+                        default=DEFAULT_COMPUTE_K,
+                        help='comma-separated list of k-mer sizes (default: %(default)s)')
+    parser.add_argument('-n', '--num-hashes', type=int,
+                        default=DEFAULT_N,
+                        help='number of hashes to use in each sketch (default: %(default)i)')
+    parser.add_argument('--check-sequence', action='store_true',
+                        help='complain if input sequence is invalid (default: False)')
+    parser.add_argument('-f', '--force', action='store_true',
+                        help='recompute signatures even if the file exists (default: False)')
+    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
+                        help='output computed signatures to this file')
+    parser.add_argument('--singleton', action='store_true',
+                        help='compute a signature for each sequence record individually (default: False)')
+    parser.add_argument('--merge', '--name', type=str, default='', metavar="MERGED",
+                        help="merge all input files into one signature named this")
+    parser.add_argument('--name-from-first', action='store_true',
+                        help="name the signature generated from each file after the first record in the file (default: False)")
+    parser.add_argument('--input-is-10x', action='store_true',
+                        help="Input is 10x single cell output folder (default: False)")
+    parser.add_argument('--count-valid-reads', default=0, type=int,
+                        help="For 10x input only (i.e. input-is-10x flag is True): "
+                        "a barcode is only considered valid, and its signature "
+                        "written, if its number of UMIs is greater than "
+                        "count-valid-reads. This weeds out cell barcodes with "
+                        "very few UMIs, which may be artifacts of spurious "
+                        "enzymatic reactions")
+    parser.add_argument('--write-barcode-meta-csv', type=str,
+                        help="For 10x input only (i.e. input-is-10x flag is True): "
+                        "for each unique barcode, write the number of reads and "
+                        "the number of UMIs to the given path.")
+    parser.add_argument('-p', '--processes', default=2, type=int,
+                        help='For 10x input only (i.e. input-is-10x flag is True): '
+                        'number of processes to use for reading the 10x bam file')
+    parser.add_argument('--save-fastas', type=str,
+                        help='For 10x input only (i.e. input-is-10x flag is True): '
+                        'save merged fastas for all the unique barcodes to {CELL_BARCODE}.fasta, '
+                        'by default in the same directory from which the command is run')
+    parser.add_argument('--line-count', type=int,
+                        help='For 10x input only (i.e. input-is-10x flag is True): line count for each bam shard',
+                        default=DEFAULT_LINE_COUNT)
+    parser.add_argument('--track-abundance', action='store_true',
+                        help='track k-mer abundances in the generated signature (default: False)')
+    parser.add_argument('--scaled', type=float, default=0,
+                        help='choose number of hashes as 1 in FRACTION of input k-mers')
+    parser.add_argument('--seed', type=int,
+                        help='seed used by MurmurHash (default: 42)',
+                        default=DEFAULT_SEED)
+    parser.add_argument('--randomize', action='store_true',
+                        help='shuffle the list of input filenames randomly')
+    parser.add_argument('--license', default='CC0', type=str,
+                        help='signature license. 
Currently only CC0 is supported.') + parser.add_argument('--rename-10x-barcodes', type=str, + help="Tab-separated file mapping 10x barcode name " + "to new name, e.g. with channel or cell " + "annotation label", required=False) + parser.add_argument('--barcodes-file', type=str, + help="Barcodes file if the input is unfiltered 10x bam file", required=False) + + args = parser.parse_args(args) + set_quiet(args.quiet) + + if args.license != 'CC0': + error('error: sourmash only supports CC0-licensed signatures. sorry!') + sys.exit(-1) + + if args.input_is_protein and args.dna: + notify('WARNING: input is protein, turning off nucleotide hashing') + args.dna = False + args.protein = True + + if args.scaled: + if args.scaled < 1: + error('ERROR: --scaled value must be >= 1') + sys.exit(-1) + if args.scaled != round(args.scaled, 0): + error('ERROR: --scaled value must be integer value') + sys.exit(-1) + if args.scaled >= 1e9: + notify('WARNING: scaled value is nonsensical!? Continuing anyway.') + + if args.num_hashes != 0: + notify('setting num_hashes to 0 because --scaled is set') + args.num_hashes = 0 + + notify('computing signatures for files: {}', ", ".join(args.filenames)) + + if args.randomize: + notify('randomizing file list because of --randomize') + random.shuffle(args.filenames) + + # get list of k-mer sizes for which to compute sketches + ksizes = args.ksizes + if ',' in ksizes: + ksizes = ksizes.split(',') + ksizes = list(map(int, ksizes)) + else: + ksizes = [int(ksizes)] + + notify('Computing signature for ksizes: {}', str(ksizes)) + num_sigs = 0 + if args.dna and args.protein: + notify('Computing both nucleotide and protein signatures.') + num_sigs = 2*len(ksizes) + elif args.dna and args.dayhoff: + notify('Computing both nucleotide and Dayhoff-encoded protein ' + 'signatures.') + num_sigs = 2*len(ksizes) + elif args.dna: + notify('Computing only nucleotide (and not protein) signatures.') + num_sigs = len(ksizes) + elif args.protein: + notify('Computing only protein (and not nucleotide) signatures.') + num_sigs = len(ksizes) + elif args.dayhoff: + notify('Computing only Dayhoff-encoded protein (and not nucleotide) ' + 'signatures.') + num_sigs = len(ksizes) + + if (args.protein or args.dayhoff) and not args.input_is_protein: + bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ] + if bad_ksizes: + error('protein ksizes must be divisible by 3, sorry!') + error('bad ksizes: {}', ", ".join(bad_ksizes)) + sys.exit(-1) + + notify('Computing a total of {} signature(s).', num_sigs) + + if num_sigs == 0: + error('...nothing to calculate!? 
Exiting!')
+        sys.exit(-1)
+
+    if args.merge and not args.output:
+        error("must specify -o with --merge")
+        sys.exit(-1)
+
+    def make_minhashes():
+        seed = args.seed
+
+        # one minhash for each ksize
+        Elist = []
+        for k in ksizes:
+            if args.protein:
+                E = MinHash(ksize=k, n=args.num_hashes,
+                            is_protein=True,
+                            dayhoff=False,
+                            track_abundance=args.track_abundance,
+                            scaled=args.scaled,
+                            seed=seed)
+                Elist.append(E)
+            if args.dayhoff:
+                E = MinHash(ksize=k, n=args.num_hashes,
+                            is_protein=True,
+                            dayhoff=True,
+                            track_abundance=args.track_abundance,
+                            scaled=args.scaled,
+                            seed=seed)
+                Elist.append(E)
+            if args.dna:
+                E = MinHash(ksize=k, n=args.num_hashes,
+                            is_protein=False,
+                            dayhoff=False,
+                            track_abundance=args.track_abundance,
+                            scaled=args.scaled,
+                            seed=seed)
+                Elist.append(E)
+        return Elist
+
+    def add_seq(Elist, seq, input_is_protein, check_sequence):
+        for E in Elist:
+            if input_is_protein:
+                E.add_protein(seq)
+            else:
+                E.add_sequence(seq, not check_sequence)
+
+    def build_siglist(Elist, filename, name=None):
+        return [ sig.SourmashSignature(E, filename=filename,
+                                       name=name) for E in Elist ]
+
+    def iter_split(string, sep=None):
+        """Split a string by the given separator and
+        return the results in a generator"""
+        sep = sep or ' '
+        groups = itertools.groupby(string, lambda s: s != sep)
+        return (''.join(g) for k, g in groups if k)
+
+    def fasta_to_sig_record(index):
+        """Convert fasta to sig record"""
+        if umi_filter:
+            return filtered_umi_to_sig(index)
+        else:
+            return unfiltered_umi_to_sig(index)
+
+    def unfiltered_umi_to_sig(index):
+        """Returns signature records across fasta files for a unique barcode"""
+        # Initializing time
+        startt = time.time()
+
+        # Getting all fastas for a given barcode
+        # from different shards
+        single_barcode_fastas = all_fastas_sorted[index]
+
+        count = 0
+        # Iterating through the fasta files for a single barcode
+        # from different shards
+        for fasta in iter_split(single_barcode_fastas, ","):
+
+            # Initializing the fasta file to write
+            # all the sequences from all bam shards to
+            if count == 0:
+                unique_fasta_file = os.path.basename(fasta)
+                if args.save_fastas:
+                    f = open(unique_fasta_file, "w")
+
+            # Add sequence
+            for record in screed.open(fasta):
+                sequence = record.sequence
+                add_seq(Elist, sequence,
+                        args.input_is_protein, args.check_sequence)
+
+                # Updating the fasta file with each of the sequences
+                if args.save_fastas:
+                    f.write(">{}\n{}\n".format(record.name, record.sequence))
+
+            # Delete fasta file in tmp folder
+            if os.path.exists(fasta):
+                os.unlink(fasta)
+
+            count += 1
+
+        # Close the opened fasta file
+        if args.save_fastas:
+            f.close()
+
+        # Build signature records
+        barcode_name = unique_fasta_file.replace(".fasta", "")
+        siglist = build_siglist(
+            Elist,
+            os.path.join(args.filenames[0], unique_fasta_file),
+            name=barcode_name)
+        records = signature_json.add_meta_save(siglist)
+
+        notify(
+            "time taken to build signature records for a barcode {} is {:.5f} seconds",
+            unique_fasta_file, time.time() - startt, end='\r')
+        return records
+
+    def filtered_umi_to_sig(index):
+        """Returns signature records for all the fasta files for a unique
+        barcode, only if it has more than count-valid-reads number of umis."""
+
+        # Initializing time
+        startt = time.time()
+
+        # Getting all fastas for a given barcode
+        # from different shards
+        single_barcode_fastas = all_fastas_sorted[index]
+
+        debug("calculating umi counts", end="\r", flush=True)
+        # Tracking UMI Counts
+        umis = defaultdict(int)
+        # Iterating through the fasta files for a single barcode
+        # from different shards
+        for fasta in iter_split(single_barcode_fastas, ","):
+            # calculate unique umi, sequence counts
+            for record in screed.open(fasta):
+                umis[record.name] += record.sequence.count(delimiter)
+
+        if args.write_barcode_meta_csv:
+            unique_fasta_file = os.path.basename(fasta)
+            unique_fasta_file = unique_fasta_file.replace(".fasta", "_meta.txt")
+            with open(unique_fasta_file, "w") as f:
+                f.write("{} {}".format(len(umis), sum(list(umis.values()))))
+
+        debug("Completed tracking umi counts", end="\r", flush=True)
+        if len(umis) < args.count_valid_reads:
+            return []
+        count = 0
+        for fasta in iter_split(single_barcode_fastas, ","):
+
+            # Initializing fasta file to save the sequence to
+            if count == 0:
+                unique_fasta_file = os.path.basename(fasta)
+                if args.save_fastas:
+                    f = open(unique_fasta_file, "w")
+
+            # Add sequences of barcodes with more than count-valid-reads umis
+            for record in screed.open(fasta):
+                sequence = record.sequence
+                add_seq(Elist, sequence,
+                        args.input_is_protein, args.check_sequence)
+
+                # Updating the fasta file with each of the sequences
+                if args.save_fastas:
+                    f.write(">{}\n{}\n".format(record.name, record.sequence))
+
+            # Delete fasta file in tmp folder
+            if os.path.exists(fasta):
+                os.unlink(fasta)
+            count += 1
+
+        debug("Added sequences of unique barcode,umi to Elist", end="\r",
+              flush=True)
+        # Close the opened fasta file
+        if args.save_fastas:
+            f.close()
+        # Build signature records
+        barcode_name = unique_fasta_file.replace(".fasta", "")
+        siglist = build_siglist(
+            Elist,
+            os.path.join(args.filenames[0], unique_fasta_file),
+            name=barcode_name)
+        records = signature_json.add_meta_save(siglist)
+        notify(
+            "time taken to build signature records for a barcode {} is {:.5f} seconds",
+            unique_fasta_file, time.time() - startt, end="\r", flush=True)
+        return records
+
+    def save_siglist(siglist, output_fp, filename=None):
+        # save!
+        if output_fp:
+            sigfile_name = args.output.name
+            sig.save_signatures(siglist, args.output)
+        else:
+            if filename is None:
+                raise Exception("internal error, filename is None")
+            with open(filename, 'w') as fp:
+                sigfile_name = filename
+                sig.save_signatures(siglist, fp)
+        notify(
+            'saved signature(s) to {}. 
Note: signature license is CC0.',
+            sigfile_name,
+            end="\r")
+
+    def calculate_chunksize(length, num_jobs):
+        chunksize, extra = divmod(length, num_jobs)
+        if extra:
+            chunksize += 1
+        return chunksize
+
+    if args.input_is_10x:
+        all_fastas_sorted = []
+        delimiter = "X"
+        umi_filter = True if args.count_valid_reads != 0 else False
+        Elist = make_minhashes()
+        CELL_BARCODE = "CELL_BARCODE"
+        UMI_COUNT = "UMI_COUNT"
+        READ_COUNT = "READ_COUNT"
+
+    if args.track_abundance:
+        notify('Tracking abundance of input k-mers.')
+
+    if not args.merge:
+        if args.output:
+            siglist = []
+
+        for filename in args.filenames:
+            sigfile = os.path.basename(filename) + '.sig'
+            if not args.output and os.path.exists(sigfile) and not \
+                    args.force:
+                notify('skipping {} - already done', filename)
+                continue
+
+            if args.singleton:
+                siglist = []
+                for n, record in enumerate(screed.open(filename)):
+                    # make minhashes for each sequence
+                    Elist = make_minhashes()
+                    add_seq(Elist, record.sequence,
+                            args.input_is_protein, args.check_sequence)
+
+                    siglist += build_siglist(Elist, filename, name=record.name)
+
+                notify('calculated {} signatures for {} sequences in {}',
+                       len(siglist), n + 1, filename)
+            elif args.input_is_10x:
+                from .tenx import (read_barcodes_file, shard_bam_file,
+                                   bam_to_fasta)
+                from pathos import multiprocessing
+
+                # Initializing time
+                startt = time.time()
+
+                # Setting barcodes file; some 10x files don't have a filtered
+                # barcode file
+                if args.barcodes_file is not None:
+                    barcodes = read_barcodes_file(args.barcodes_file)
+                else:
+                    barcodes = None
+
+                # Shard bam file into smaller bam files
+                notify('... reading bam file from {}', filename)
+                n_jobs = args.processes
+                filenames, mmap_file = np_utils.to_memmap(np.array(
+                    shard_bam_file(filename, args.line_count, os.getcwd())))
+
+                # Create a per-cell fasta generator of sequences.
+                # If the reads should be filtered by barcodes and umis,
+                # the umis are saved in the fasta file as the record name and
+                # the name of the fasta file is the barcode
+                func = partial(
+                    bam_to_fasta,
+                    barcodes,
+                    args.rename_10x_barcodes,
+                    delimiter,
+                    umi_filter)
+
+                length_sharded_bam_files = len(filenames)
+                chunksize = calculate_chunksize(length_sharded_bam_files,
+                                                n_jobs)
+                pool = multiprocessing.Pool(processes=n_jobs)
+                notify(
+                    "multiprocessing pool processes {} and chunksize {} calculated", n_jobs, chunksize)
+                # All the fastas are stored in a single string instead of a
+                # list, which avoids the per-element pointer overhead (8 bytes
+                # each on 64-bit systems). With unique barcodes on the order
+                # of 10^6 before filtering, saving each barcode as a separate
+                # element would produce a huge list, hence the string
+                all_fastas = ",".join(itertools.chain(*(
+                    pool.imap(
+                        lambda x: func(x.encode('utf-8')), filenames, chunksize=chunksize))))
+                pool.close()
+                pool.join()
+
+                # clean up the memmap and sharded intermediary bam files
+                [os.unlink(file) for file in filenames if os.path.exists(file)]
+                del filenames
+                os.unlink(mmap_file)
+                notify("Deleted intermediary bam and memmap files")
+
+                # Build a dictionary with each unique barcode as key and
+                # their fasta files from different shards
+                fasta_files_dict = collections.OrderedDict()
+                for fasta in iter_split(all_fastas, ","):
+                    barcode = os.path.basename(fasta).replace(".fasta", "")
+                    value = fasta_files_dict.get(barcode, "")
+                    fasta_files_dict[barcode] = value + fasta + ","
+
+                # Cleaning up to retrieve memory from unused large variables
+                del all_fastas
+                notify("Created fasta_files_dict")
+
+                # Find unique barcodes
+                all_fastas_sorted = list(fasta_files_dict.values())
+                del fasta_files_dict
+                unique_barcodes = len(all_fastas_sorted)
+                notify("Found {} unique barcodes", unique_barcodes)
+
+                # For each barcode, add all of its sequences, compute the
+                # minhashes, and convert them to signature records. The
+                # fasta_to_sig_record call below also respects umi_filter,
+                # creating records only for barcodes with at least
+                # count-valid-reads UMIs. This keeps just the barcodes with
+                # valid reads and a meaningful number of UMIs; otherwise
+                # every barcode would be saved, even those with a single
+                # umi/read, effectively passing through every alignment in
+                # the bam file and producing a signature file measured in GB
+                # when the expected .sig file should be on the order of MB
+
+                pool = multiprocessing.Pool(processes=n_jobs)
+                chunksize = calculate_chunksize(unique_barcodes, n_jobs)
+                notify("Pooled {} and chunksize {} mapped", n_jobs, chunksize)
+
+                records = list(itertools.chain(*pool.imap(
+                    lambda index: fasta_to_sig_record(index),
+                    range(unique_barcodes),
+                    chunksize=chunksize)))
+
+                pool.close()
+                pool.join()
+
+                if args.write_barcode_meta_csv:
+
+                    barcodes_meta_txts = glob.glob("*_meta.txt")
+
+                    with open(args.write_barcode_meta_csv, "w") as fp:
+                        fp.write("{},{},{}".format(CELL_BARCODE, UMI_COUNT,
+                                                   READ_COUNT))
+                        fp.write('\n')
+                        for barcode_meta_txt in barcodes_meta_txts:
+                            with open(barcode_meta_txt, 'r') as f:
+                                umi_count, read_count = f.readline().split()
+                                umi_count = int(umi_count)
+                                read_count = int(read_count)
+
+                                barcode_name = barcode_meta_txt.replace('_meta.txt', '')
+                                fp.write("{},{},{}\n".format(barcode_name,
+                                                             umi_count,
+                                                             read_count))
+
+                            os.unlink(barcode_meta_txt)
+
+                filtered_barcode_signatures = len(records)
+                notify("Signature records created for {} barcodes",
+                       filtered_barcode_signatures)
+
+                # Save the signature records in a .sig file
+                if args.output is not None:
+                    signature_json.write_records_to_json(records, args.output)
+                else:
+                    signature_json.write_records_to_json(records, open(sigfile, "wt"))
+                del records
+
+                notify("time taken to save {} signature records for 10x folder is {:.5f} seconds",
+                       filtered_barcode_signatures, time.time() - startt)
+            else:
+                # make minhashes for the whole file
+                Elist = make_minhashes()
+
+                # consume & calculate signatures
+                notify('... reading sequences from {}', filename)
+                name = None
+                for n, record in enumerate(screed.open(filename)):
+                    if n % 10000 == 0:
+                        if n:
+                            notify('\r...{} {}', filename, n, end='')
+                        elif args.name_from_first:
+                            name = record.name
+
+                    add_seq(Elist, record.sequence,
+                            args.input_is_protein, args.check_sequence)
+
+                notify('...{} {} sequences', filename, n, end='')
+
+                sigs = build_siglist(Elist, filename, name)
+                if args.output:
+                    siglist += sigs
+                else:
+                    siglist = sigs
+
+                notify('calculated {} signatures for {} sequences in {}',
+                       len(sigs), n + 1, filename)
+
+        if not args.output and not args.input_is_10x:
+            save_siglist(siglist, args.output, sigfile)
+
+        if args.output and not args.input_is_10x:
+            save_siglist(siglist, args.output, sigfile)
+    else:                             # single name specified - combine all
+        # make minhashes for the whole file
+        Elist = make_minhashes()
+
+        total_seq = 0
+        for filename in args.filenames:
+            # consume & calculate signatures
+            notify('... reading sequences from {}', filename)
+
+            for n, record in enumerate(screed.open(filename)):
+                if n % 10000 == 0 and n:
+                    notify('\r... 
{} {}', filename, n, end='') + + add_seq(Elist, record.sequence, + args.input_is_protein, args.check_sequence) + notify('... {} {} sequences', filename, n + 1) + + total_seq += n + 1 + + siglist = build_siglist(Elist, filename, name=args.merge) + notify('calculated {} signatures for {} sequences taken from {} files', + len(siglist), total_seq, len(args.filenames)) + + # at end, save! + save_siglist(siglist, args.output) diff --git a/sourmash/commands.py b/sourmash/commands.py index 9c4db3f408..d513d533a3 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -1,28 +1,30 @@ -from __future__ import print_function, division +""" +Functions implementing the main command-line subcommands. +""" +from __future__ import print_function, division, absolute_import import argparse import csv -import itertools -import multiprocessing import os import os.path import sys -import random import screed +from .compare import compare_all_pairs from .sourmash_args import SourmashArgumentParser -from . import DEFAULT_SEED, MinHash, load_sbt_index, create_sbt_index +from . import MinHash, load_sbt_index, create_sbt_index from . import signature as sig from . import sourmash_args from .logging import notify, error, print_results, set_quiet from .sbtmh import SearchMinHashesFindBest, SigLeaf from .sourmash_args import DEFAULT_LOAD_K -DEFAULT_COMPUTE_K = '21,31,51' DEFAULT_N = 500 WATERMARK_SIZE = 10000 +from .command_compute import compute + def info(args): "Report sourmash version + version of installed dependencies." @@ -46,310 +48,6 @@ def info(args): notify('screed version {}', screed.__version__) notify('- loaded from path: {}', os.path.dirname(screed.__file__)) -def compute(args): - """Compute the signature for one or more files. - - Use cases: - sourmash compute multiseq.fa => multiseq.fa.sig, etc. - sourmash compute genome.fa --singleton => genome.fa.sig - sourmash compute file1.fa file2.fa -o file.sig - => creates one output file file.sig, with one signature for each - input file. - sourmash compute file1.fa file2.fa --merge merged -o file.sig - => creates one output file file.sig, with all sequences from - file1.fa and file2.fa combined into one signature. 
- """ - parser = SourmashArgumentParser() - parser.add_argument('filenames', nargs='+', - help='file(s) of sequences') - - sourmash_args.add_construct_moltype_args(parser) - - parser.add_argument('-q', '--quiet', action='store_true', - help='suppress non-error output') - parser.add_argument('--input-is-protein', action='store_true', - help='Consume protein sequences - no translation needed.') - parser.add_argument('-k', '--ksizes', - default=DEFAULT_COMPUTE_K, - help='comma-separated list of k-mer sizes (default: %(default)s)') - parser.add_argument('-n', '--num-hashes', type=int, - default=DEFAULT_N, - help='number of hashes to use in each sketch (default: %(default)i)') - parser.add_argument('--check-sequence', action='store_true', - help='complain if input sequence is invalid (default: False)') - parser.add_argument('-f', '--force', action='store_true', - help='recompute signatures even if the file exists (default: False)') - parser.add_argument('-o', '--output', type=argparse.FileType('wt'), - help='output computed signatures to this file') - parser.add_argument('--singleton', action='store_true', - help='compute a signature for each sequence record individually (default: False)') - parser.add_argument('--merge', '--name', type=str, default='', metavar="MERGED", - help="merge all input files into one signature named this") - parser.add_argument('--name-from-first', action='store_true', - help="name the signature generated from each file after the first record in the file (default: False)") - parser.add_argument('--input-is-10x', action='store_true', - help="Input is 10x single cell output folder (default: False)") - parser.add_argument('-p', '--processes', default=2, type=int, - help='Number of processes to use for reading 10x bam file') - parser.add_argument('--track-abundance', action='store_true', - help='track k-mer abundances in the generated signature (default: False)') - parser.add_argument('--scaled', type=float, default=0, - help='choose number of hashes as 1 in FRACTION of input k-mers') - parser.add_argument('--seed', type=int, - help='seed used by MurmurHash (default: 42)', - default=DEFAULT_SEED) - parser.add_argument('--randomize', action='store_true', - help='shuffle the list of input filenames randomly') - parser.add_argument('--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.') - - args = parser.parse_args(args) - set_quiet(args.quiet) - - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') - sys.exit(-1) - - if args.input_is_protein and args.dna: - notify('WARNING: input is protein, turning off nucleotide hashing') - args.dna = False - args.protein = True - - if args.scaled: - if args.scaled < 1: - error('ERROR: --scaled value must be >= 1') - sys.exit(-1) - if args.scaled != round(args.scaled, 0): - error('ERROR: --scaled value must be integer value') - sys.exit(-1) - if args.scaled >= 1e9: - notify('WARNING: scaled value is nonsensical!? 
Continuing anyway.') - - if args.num_hashes != 0: - notify('setting num_hashes to 0 because --scaled is set') - args.num_hashes = 0 - - notify('computing signatures for files: {}', ", ".join(args.filenames)) - - if args.randomize: - notify('randomizing file list because of --randomize') - random.shuffle(args.filenames) - - # get list of k-mer sizes for which to compute sketches - ksizes = args.ksizes - if ',' in ksizes: - ksizes = ksizes.split(',') - ksizes = list(map(int, ksizes)) - else: - ksizes = [int(ksizes)] - - notify('Computing signature for ksizes: {}', str(ksizes)) - - num_sigs = 0 - if args.dna and args.protein: - notify('Computing both nucleotide and protein signatures.') - num_sigs = 2*len(ksizes) - elif args.dna: - notify('Computing only nucleotide (and not protein) signatures.') - num_sigs = len(ksizes) - elif args.protein: - notify('Computing only protein (and not nucleotide) signatures.') - num_sigs = len(ksizes) - - if args.protein and not args.input_is_protein: - bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ] - if bad_ksizes: - error('protein ksizes must be divisible by 3, sorry!') - error('bad ksizes: {}', ", ".join(bad_ksizes)) - sys.exit(-1) - - notify('Computing a total of {} signature(s).', num_sigs) - - if num_sigs == 0: - error('...nothing to calculate!? Exiting!') - sys.exit(-1) - - if args.merge and not args.output: - error("must specify -o with --merge") - sys.exit(-1) - - def make_minhashes(): - seed = args.seed - - # one minhash for each ksize - Elist = [] - for k in ksizes: - if args.protein: - E = MinHash(ksize=k, n=args.num_hashes, - is_protein=True, - track_abundance=args.track_abundance, - scaled=args.scaled, - seed=seed) - Elist.append(E) - if args.dna: - E = MinHash(ksize=k, n=args.num_hashes, - is_protein=False, - track_abundance=args.track_abundance, - scaled=args.scaled, - seed=seed) - Elist.append(E) - return Elist - - def add_seq(Elist, seq, input_is_protein, check_sequence): - for E in Elist: - if input_is_protein: - E.add_protein(seq) - else: - E.add_sequence(seq, not check_sequence) - - def build_siglist(Elist, filename, name=None): - return [ sig.SourmashSignature(E, filename=filename, - name=name) for E in Elist ] - - def save_siglist(siglist, output_fp, filename=None): - # save! - if output_fp: - sig.save_signatures(siglist, args.output) - else: - if filename is None: - raise Exception("internal error, filename is None") - with open(filename, 'w') as fp: - sig.save_signatures(siglist, fp) - notify('saved {} signature(s). 
Note: signature license is CC0.'.format(len(siglist))) - - def maybe_add_barcode(barcode, cell_seqs): - if barcode not in cell_seqs: - cell_seqs[barcode] = make_minhashes() - - def maybe_add_alignment(alignment, cell_seqs, args, barcodes): - high_quality_mapping = alignment.mapq == 255 - good_barcode = 'CB' in alignment.tags and \ - alignment.get_tag('CB') in barcodes - good_umi = 'UB' in alignment.tags - - pass_qc = high_quality_mapping and good_barcode and \ - good_umi - if pass_qc: - barcode = alignment.get_tag('CB') - # if this isn't marked a duplicate, count it as a UMI - if not alignment.is_duplicate: - maybe_add_barcode(barcode, cell_seqs) - add_seq(cell_seqs[barcode], alignment.seq, - args.input_is_protein, args.check_sequence) - - if args.track_abundance: - notify('Tracking abundance of input k-mers.') - - if not args.merge: - if args.output: - siglist = [] - - for filename in args.filenames: - sigfile = os.path.basename(filename) + '.sig' - if not args.output and os.path.exists(sigfile) and not \ - args.force: - notify('skipping {} - already done', filename) - continue - - if args.singleton: - siglist = [] - for n, record in enumerate(screed.open(filename)): - # make minhashes for each sequence - Elist = make_minhashes() - add_seq(Elist, record.sequence, - args.input_is_protein, args.check_sequence) - - siglist += build_siglist(Elist, filename, name=record.name) - - notify('calculated {} signatures for {} sequences in {}', - len(siglist), n + 1, filename) - elif args.input_is_10x: - import pathos.multiprocessing as mp - from .tenx import read_10x_folder - - barcodes, bam_file = read_10x_folder(filename) - manager = multiprocessing.Manager() - - cell_seqs = manager.dict() - - notify('... reading sequences from {}', filename) - - pool = mp.Pool(processes=args.processes) - pool.map(lambda x: maybe_add_alignment(x, cell_seqs, args, barcodes), bam_file) - # for n, alignment in enumerate(bam_file): - # if n % 10000 == 0: - # if n: - # notify('\r...{} {}', filename, n, end='') - # maybe_add_alignment(alignment, cell_seqs) - - cell_signatures = [ - build_siglist(seqs, filename=filename, name=barcode) - for barcode, seqs in cell_seqs.items()] - sigs = list(itertools.chain(*cell_signatures)) - if args.output: - siglist += sigs - else: - siglist = sigs - - else: - # make minhashes for the whole file - Elist = make_minhashes() - - # consume & calculate signatures - notify('... reading sequences from {}', filename) - name = None - for n, record in enumerate(screed.open(filename)): - if n % 10000 == 0: - if n: - notify('\r...{} {}', filename, n, end='') - elif args.name_from_first: - name = record.name - - add_seq(Elist, record.sequence, - args.input_is_protein, args.check_sequence) - - notify('...{} {} sequences', filename, n, end='') - - sigs = build_siglist(Elist, filename, name) - if args.output: - siglist += sigs - else: - siglist = sigs - - notify('calculated {} signatures for {} sequences in {}', - len(sigs), n + 1, filename) - - if not args.output: - save_siglist(siglist, args.output, sigfile) - - if args.output: - save_siglist(siglist, args.output, sigfile) - else: # single name specified - combine all - # make minhashes for the whole file - Elist = make_minhashes() - - total_seq = 0 - for filename in args.filenames: - # consume & calculate signatures - notify('... reading sequences from {}', filename) - - for n, record in enumerate(screed.open(filename)): - if n % 10000 == 0 and n: - notify('\r... 
{} {}', filename, n, end='') - - add_seq(Elist, record.sequence, - args.input_is_protein, args.check_sequence) - notify('... {} {} sequences', filename, n + 1) - - total_seq += n + 1 - - siglist = build_siglist(Elist, filename, name=args.merge) - notify('calculated {} signatures for {} sequences taken from {} files', - len(siglist), total_seq, len(args.filenames)) - - # at end, save! - save_siglist(siglist, args.output) - def compare(args): "Compare multiple signature files and create a distance matrix." @@ -366,6 +64,8 @@ def compare(args): help='compare all signatures underneath directories.') parser.add_argument('--csv', type=argparse.FileType('w'), help='save matrix in CSV format (with column headers)') + parser.add_argument('-p', '--processes', type=int, + help='Number of processes to use to calculate similarity') parser.add_argument('-q', '--quiet', action='store_true', help='suppress non-error output') args = parser.parse_args(args) @@ -440,31 +140,22 @@ def compare(args): notify('') # build the distance matrix - D = numpy.zeros([len(siglist), len(siglist)]) numpy.set_printoptions(precision=3, suppress=True) # do all-by-all calculation - labeltext = [] - for i, E in enumerate(siglist): - for j, E2 in enumerate(siglist): - if i < j: - continue - similarity = E.similarity(E2, args.ignore_abundance) - D[i][j] = similarity - D[j][i] = similarity - - labeltext.append(E.name()) + labeltext = [item.name() for item in siglist] + similarity = compare_all_pairs(siglist, args.ignore_abundance, + n_jobs=args.processes) if len(siglist) < 30: for i, E in enumerate(siglist): # for small matrices, pretty-print some output name_num = '{}-{}'.format(i, E.name()) if len(name_num) > 20: name_num = name_num[:17] + '...' - print_results('{:20s}\t{}'.format(name_num, D[i, :, ],)) - - print_results('min similarity in matrix: {:.3f}', numpy.min(D)) + print_results('{:20s}\t{}'.format(name_num, similarity[i, :, ],)) + print_results('min similarity in matrix: {:.3f}', numpy.min(similarity)) # shall we output a matrix? if args.output: labeloutname = args.output + '.labels.txt' @@ -474,7 +165,7 @@ def compare(args): notify('saving distance matrix to: {}', args.output) with open(args.output, 'wb') as fp: - numpy.save(fp, D) + numpy.save(fp, similarity) # output CSV? if args.csv: @@ -484,7 +175,7 @@ def compare(args): for i in range(len(labeltext)): y = [] for j in range(len(labeltext)): - y.append('{}'.format(D[i][j])) + y.append('{}'.format(similarity[i][j])) args.csv.write(','.join(y) + '\n') @@ -516,6 +207,7 @@ def plot(args): help="random seed for --subsample; default=1") parser.add_argument('-f', '--force', action='store_true', help='forcibly plot non-distance matrices') + parser.add_argument('--output-dir', help='directory for output plots') args = parser.parse_args(args) @@ -552,6 +244,14 @@ def plot(args): else: hist_out += '.png' + # output to a different directory? 
+ if args.output_dir: + if not os.path.isdir(args.output_dir): + os.mkdir(args.output_dir) + dendrogram_out = os.path.join(args.output_dir, dendrogram_out) + matrix_out = os.path.join(args.output_dir, matrix_out) + hist_out = os.path.join(args.output_dir, hist_out) + # make the histogram notify('saving histogram of matrix values => {}', hist_out) fig = pylab.figure(figsize=(8,5)) @@ -575,10 +275,9 @@ def plot(args): np_idx = numpy.array(sample_idx) D = D[numpy.ix_(np_idx, np_idx)] labeltext = [ labeltext[idx] for idx in sample_idx ] - ### do clustering Y = sch.linkage(D, method='single') - Z1 = sch.dendrogram(Y, orientation='right', labels=labeltext) + sch.dendrogram(Y, orientation='right', labels=labeltext) fig.savefig(dendrogram_out) notify('wrote dendrogram to: {}', dendrogram_out) @@ -636,7 +335,8 @@ def import_csv(args): def dump(args): parser = SourmashArgumentParser() parser.add_argument('filenames', nargs='+') - parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_LOAD_K, help='k-mer size (default: %(default)i)') + parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_LOAD_K, + help='k-mer size (default: %(default)i)') args = parser.parse_args(args) for filename in args.filenames: @@ -662,7 +362,6 @@ def sbt_combine(args): sourmash_args.add_moltype_args(parser) args = parser.parse_args(args) - moltype = sourmash_args.calculate_moltype(args) inp_files = list(args.sbts) notify('combining {} SBTs', len(inp_files)) @@ -693,7 +392,7 @@ def index(args): parser.add_argument('-d', '--n_children', type=int, default=2, help='Number of children for internal nodes') parser.add_argument('--traverse-directory', action='store_true', - help='load all signatures underneath this directory.') + help='load all signatures underneath any directories.') parser.add_argument('--append', action='store_true', default=False, help='add signatures to an existing SBT.') parser.add_argument('-x', '--bf-size', type=float, default=1e5, @@ -1110,8 +809,15 @@ def gather(args): outname = args.output_unassigned.name notify('saving unassigned hashes to "{}"', outname) - e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash) - e.add_many(next_query.minhash.get_mins()) + with_abundance = next_query.minhash.track_abundance + e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash, + track_abundance=with_abundance) + if with_abundance: + abunds = next_query.minhash.get_mins(with_abundance=True) + e.set_abundances(abunds) + else: + e.add_many(next_query.minhash.get_mins()) + sig.save_signatures([ sig.SourmashSignature(e) ], args.output_unassigned) @@ -1310,9 +1016,15 @@ def watch(args): if args.dna: moltype = 'DNA' is_protein = False - else: + dayhoff = False + elif args.protein: moltype = 'protein' is_protein = True + dayhoff = False + else: + moltype = 'dayhoff' + is_protein = True + dayhoff = True tree = load_sbt_index(args.sbt_name) @@ -1323,7 +1035,7 @@ def watch(args): tree_mh = leaf.data.minhash ksize = tree_mh.ksize - E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein) + E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff) streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) notify('Computing signature for k={}, {} from stdin', ksize, moltype) diff --git a/sourmash/compare.py b/sourmash/compare.py new file mode 100644 index 0000000000..7b30a3a7fe --- /dev/null +++ b/sourmash/compare.py @@ -0,0 +1,208 @@ +"""Functionality for comparing many signatures, used in sourmash compare.""" + +import itertools +from 
functools import partial
+import time
+import multiprocessing
+import numpy as np
+
+from .logging import notify
+from sourmash.np_utils import to_memmap
+
+
+def compare_serial(siglist, ignore_abundance, downsample=False):
+    """Compare all combinations of signatures and return a matrix
+    of similarities. Processes combinations serially on a single
+    process. Best to use when there are few signatures.
+
+    :param list siglist: list of signatures to compare
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :return: np.array similarity matrix
+    """
+
+    n = len(siglist)
+
+    # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
+    iterator = itertools.combinations(range(n), 2)
+
+    similarities = np.ones((n, n))
+
+    for i, j in iterator:
+        similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance, downsample)
+
+    return similarities
+
+
+def similarity(sig1, sig2, ignore_abundance, downsample):
+    """Compute similarity with the other MinHash signature.
+    This function is separated from the SourmashSignature class to
+    avoid pickling the whole class during the pool map
+
+    :param sig1 first signature
+    :param sig2 other signature to compare with
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :return: float similarity of the two signatures
+    """
+
+    try:
+        sig = sig1.minhash.similarity(sig2.minhash, ignore_abundance)
+        return sig
+    except ValueError as e:
+        if 'mismatch in max_hash' in str(e) and downsample:
+            xx = sig1.minhash.downsample_max_hash(sig2.minhash)
+            yy = sig2.minhash.downsample_max_hash(sig1.minhash)
+            sig = xx.similarity(yy, ignore_abundance)
+            return sig
+        else:
+            raise
+
+
+def similarity_args_unpack(args, ignore_abundance, downsample):
+    """Helper function to unpack the arguments. Written to use in pool.imap as it
+    can only be given one argument."""
+    return similarity(*args, ignore_abundance=ignore_abundance, downsample=downsample)
+
+
+def get_similarities_at_index(index, ignore_abundance, downsample, siglist):
+    """Returns similarities of all the combinations of the signature at index in
+    the siglist with the rest of the signatures starting at index + 1, avoiding
+    redundant calculation of combinations with indices at or before index
+
+    :param int index: index of the signature to compare with the rest of the list
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :param siglist list of signatures
+    :return: list of similarities for the combinations of the signature at index
+        with the rest of the signatures from index + 1
+    """
+    startt = time.time()
+    sig_iterator = itertools.product([siglist[index]], siglist[index + 1:])
+    func = partial(similarity_args_unpack,
+                   ignore_abundance=ignore_abundance,
+                   downsample=downsample)
+    similarity_list = list(map(func, sig_iterator))
+    notify(
+        "comparison for index {} done in {:.5f} seconds",
+        index,
+        time.time() - startt,
+        end='\r')
+    return similarity_list
+
+
+def compare_parallel(siglist, ignore_abundance, downsample, n_jobs):
+    """Compare all combinations of signatures and return a matrix
+    of similarities. Processes combinations in parallel on the number
+    of processes given by n_jobs
+
+    :param list siglist: list of signatures to compare
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :param int n_jobs number of processes to run the similarity calculations on
+    :return: np.array similarity matrix
+    """
+    # Starting time - calculate time to keep track in case of a lengthy siglist
+    start_initial = time.time()
+
+    # Create a memory map of the siglist using numpy to avoid memory burden
+    # while accessing small parts in it
+    siglist, _ = to_memmap(np.array(siglist))
+    notify("Created memmapped siglist")
+
+    # The length of the siglist determines the dimension of the square
+    # similarity matrix
+    length_siglist = len(siglist)
+
+    # Initialize with ones on the diagonal as the similarity of a signature with
+    # itself is one
+    similarities = np.eye(length_siglist, dtype=np.float64)
+    memmap_similarities, filename = to_memmap(similarities)
+    notify("Initialized memmapped similarities matrix")
+
+    # Initialize the function using functools.partial with the common arguments
+    # (siglist, ignore_abundance, downsample) for computing all the signatures.
+    # The only changing parameter that will be mapped from the pool is the index
+    func = partial(
+        get_similarities_at_index,
+        siglist=siglist,
+        ignore_abundance=ignore_abundance,
+        downsample=downsample)
+    notify("Created similarity func")
+
+    # Initialize multiprocessing.Pool
+    pool = multiprocessing.Pool(processes=n_jobs)
+
+    # Calculate chunk size; by default the pool.imap chunk size is 1
+    chunksize, extra = divmod(length_siglist, n_jobs)
+    if extra:
+        chunksize += 1
+    notify("Calculated chunk size for multiprocessing")
+
+    # This will not generate the results yet, since pool.imap returns a generator
+    result = pool.imap(func, range(length_siglist), chunksize=chunksize)
+    notify("Initialized multiprocessing pool.imap")
+
+    # Enumerate and calculate similarities at each of the indices
+    # and set the results at the appropriate combination coordinate
+    # locations inside the similarity matrix
+    for index, l in enumerate(result):
+        startt = time.time()
+        col_idx = index + 1
+        for idx_condensed, item in enumerate(l):
+            memmap_similarities[index, col_idx + idx_condensed] = memmap_similarities[idx_condensed + col_idx, index] = item
+        notify(
+            "Setting similarities matrix for index {} done in {:.5f} seconds",
+            index,
+            time.time() - startt,
+            end='\r')
+    notify("Setting similarities completed")
+
+    pool.close()
+    pool.join()
+
+    notify("Time taken to compare all pairs in parallel is {:.5f} seconds ", 
time.time() - start_initial)
+    return np.memmap(filename, dtype=np.float64, shape=(length_siglist, length_siglist))
+
+
+def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None):
+    """Compare all combinations of signatures and return a matrix
+    of similarities. Processes combinations either serially or in
+    parallel, depending on the number of processes given by n_jobs
+
+    :param list siglist: list of signatures to compare
+    :param boolean ignore_abundance
+        If the sketches are not abundance weighted, or ignore_abundance=True,
+        compute Jaccard similarity.
+
+        If the sketches are abundance weighted, calculate a distance metric
+        based on the cosine similarity.
+    :param boolean downsample by max_hash if True
+    :param int n_jobs number of processes to run the similarity calculations on;
+        if n_jobs is None or 1, compare serially, otherwise in parallel.
+    :return: np.array similarity matrix
+    """
+    if n_jobs is None or n_jobs == 1:
+        similarities = compare_serial(siglist, ignore_abundance, downsample)
+    else:
+        similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs)
+    return similarities
diff --git a/sourmash/kmer_min_hash.hh b/sourmash/kmer_min_hash.hh
index 068e3fde6f..a85b5c0c20 100644
--- a/sourmash/kmer_min_hash.hh
+++ b/sourmash/kmer_min_hash.hh
@@ -2,6 +2,7 @@
 #define KMER_MIN_HASH_HH
 
 #include
+#include
 #include
 #include
 #include
@@ -59,13 +60,14 @@ public:
     const unsigned int num;
     const unsigned int ksize;
     const bool is_protein;
+    const bool dayhoff;
     const uint32_t seed;
     const HashIntoType max_hash;
     CMinHashType mins;
 
-    KmerMinHash(unsigned int n, unsigned int k, bool prot, uint32_t s,
+    KmerMinHash(unsigned int n, unsigned int k, bool prot, bool dyhoff, uint32_t s,
                 HashIntoType mx)
-        : num(n), ksize(k), is_protein(prot), seed(s), max_hash(mx) {
+        : num(n), ksize(k), is_protein(prot), dayhoff(dyhoff), seed(s), max_hash(mx) {
         if (n > 0) {
             mins.reserve(num + 1);
         }
@@ -82,6 +84,9 @@
         if (is_protein != other.is_protein) {
             throw minhash_exception("DNA/prot minhashes cannot be compared");
         }
+        if (dayhoff != other.dayhoff) {
+            throw minhash_exception("DNA/prot minhashes cannot be compared");
+        }
         if (max_hash != other.max_hash) {
             throw minhash_exception("mismatch in max_hash; comparison fail");
         }
@@ -177,19 +182,54 @@ public:
         }
     }
 
-    std::string _dna_to_aa(const std::string& dna) {
-        std::string aa;
-        unsigned int dna_size = (dna.size() / 3) * 3; // floor it
-        for (unsigned int j = 0; j < dna_size; j += 3) {
-            std::string codon = dna.substr(j, 3);
+    std::string translate_codon(std::string& codon) {
+        std::string residue;
+
+        if (codon.length() >= 2 && codon.length() <= 3){
+            // If codon is length 2, pad with an N for ambiguous codon amino acids
+            if (codon.length() == 2) {
+                codon += "N";
+            }
             auto translated = _codon_table.find(codon);
+
             if (translated != _codon_table.end()) {
                 // "second" is the element mapped to by the codon
-                aa += translated -> second;
+                // Because .find returns an iterator
+                residue = translated -> second;
             } else {
                 // Otherwise, assign the "X" or "unknown" amino acid
-                aa += "X";
+                residue = "X";
             }
+        } else if (codon.length() == 1){
+            // We only have one nucleotide, so the amino acid is unknown
+            residue = "X";
+        } else {
+            std::string msg = "Codon is invalid length: ";
+            msg += codon;
+            throw minhash_exception(msg);
+        }
+        return residue;
+    }
+
+    std::string _dna_to_aa(const std::string& dna) {
+        std::string aa;
+        std::string codon;
+        std::string residue;
+        unsigned int dna_size = (dna.size() / 3) * 3; // floor it
+        
+        for (unsigned int j = 0; j < dna_size; j += 3) {
+
+            codon = dna.substr(j, 3);
+
+            residue = translate_codon(codon);
+
+            // Use the Dayhoff encoding of amino acids, if requested
+            if (dayhoff) {
+                std::string new_letter = aa_to_dayhoff(residue);
+                aa += new_letter;
+            } else {
+                aa += residue;
+            }
+        }
         return aa;
     }
 
@@ -226,6 +266,22 @@ public:
       return out;
     }
 
+    std::string aa_to_dayhoff(const std::string& aa) const {
+        // Convert an amino acid letter to its Dayhoff encoding
+        std::string new_letter;
+
+        auto dayhoff_encoded = _dayhoff_table.find(aa);
+        if (dayhoff_encoded != _dayhoff_table.end()) {
+            // "second" is the element mapped to by the amino acid,
+            // because .find returns an iterator
+            new_letter = dayhoff_encoded -> second;
+        } else {
+            // Otherwise, assign the "X" or "unknown" amino acid
+            new_letter = "X";
+        }
+        return new_letter;
+    }
+
     virtual void merge(const KmerMinHash& other) {
       check_compatible(other);
@@ -263,7 +319,7 @@ private:
       {"TTT", "F"}, {"TTC", "F"},
       {"TTA", "L"}, {"TTG", "L"},
 
-      {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"},
+      {"TCT", "S"}, {"TCC", "S"}, {"TCA", "S"}, {"TCG", "S"}, {"TCN", "S"},
 
       {"TAT", "Y"}, {"TAC", "Y"},
       {"TAA", "*"}, {"TAG", "*"},
@@ -272,19 +328,19 @@
       {"TGA", "*"}, {"TGG", "W"},
 
-      {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"},
+      {"CTT", "L"}, {"CTC", "L"}, {"CTA", "L"}, {"CTG", "L"}, {"CTN", "L"},
 
-      {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"},
+      {"CCT", "P"}, {"CCC", "P"}, {"CCA", "P"}, {"CCG", "P"}, {"CCN", "P"},
 
       {"CAT", "H"}, {"CAC", "H"},
       {"CAA", "Q"}, {"CAG", "Q"},
 
-      {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"},
+      {"CGT", "R"}, {"CGC", "R"}, {"CGA", "R"}, {"CGG", "R"}, {"CGN", "R"},
 
       {"ATT", "I"}, {"ATC", "I"}, {"ATA", "I"},
       {"ATG", "M"},
 
-      {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"},
+      {"ACT", "T"}, {"ACC", "T"}, {"ACA", "T"}, {"ACG", "T"}, {"ACN", "T"},
 
       {"AAT", "N"}, {"AAC", "N"},
       {"AAA", "K"}, {"AAG", "K"},
@@ -292,24 +348,62 @@
       {"AGT", "S"}, {"AGC", "S"},
       {"AGA", "R"}, {"AGG", "R"},
 
-      {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"},
+      {"GTT", "V"}, {"GTC", "V"}, {"GTA", "V"}, {"GTG", "V"}, {"GTN", "V"},
 
-      {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"},
+      {"GCT", "A"}, {"GCC", "A"}, {"GCA", "A"}, {"GCG", "A"}, {"GCN", "A"},
 
       {"GAT", "D"}, {"GAC", "D"},
       {"GAA", "E"}, {"GAG", "E"},
 
-      {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"}
+      {"GGT", "G"}, {"GGC", "G"}, {"GGA", "G"}, {"GGG", "G"}, {"GGN", "G"}
     };
+
+
+// Dayhoff table from
+// Peris, P., López, D., & Campos, M. (2008).
+// IgTM: An algorithm to predict transmembrane domains and topology in
+// proteins. BMC Bioinformatics, 9(1), 1029–11.
+// http://doi.org/10.1186/1471-2105-9-367
+//
+// Original source:
+// Dayhoff M. O., Schwartz R. M., Orcutt B. C. (1978).
+// A model of evolutionary change in proteins,
+// in Atlas of Protein Sequence and Structure,
+// ed Dayhoff M. O., editor.
+// (Washington, DC: National Biomedical Research Foundation; ), 345–352.
+//
+// | Amino acid    | Property              | Dayhoff |
+// |---------------|-----------------------|---------|
+// | C             | Sulfur polymerization | a       |
+// | A, G, P, S, T | Small                 | b       |
+// | D, E, N, Q    | Acid and amide        | c       |
+// | H, K, R       | Basic                 | d       |
+// | I, L, M, V    | Hydrophobic           | e       |
+// | F, W, Y       | Aromatic              | f       |
+std::map<std::string, std::string> _dayhoff_table = {
+    {"C", "a"},
+
+    {"A", "b"}, {"G", "b"}, {"P", "b"}, {"S", "b"}, {"T", "b"},
+
+    {"D", "c"}, {"E", "c"}, {"N", "c"}, {"Q", "c"},
+
+    {"H", "d"}, {"K", "d"}, {"R", "d"},
+
+    {"I", "e"}, {"L", "e"}, {"M", "e"}, {"V", "e"},
+
+    {"F", "f"}, {"W", "f"}, {"Y", "f"}
+};
+
 };
 
 class KmerMinAbundance: public KmerMinHash {
 public:
     CMinHashType abunds;
 
-    KmerMinAbundance(unsigned int n, unsigned int k, bool prot, uint32_t seed,
-                     HashIntoType mx) :
-        KmerMinHash(n, k, prot, seed, mx) { };
+    KmerMinAbundance(unsigned int n, unsigned int k, bool prot, bool dayhoff,
+                     uint32_t seed, HashIntoType mx) :
+        KmerMinHash(n, k, prot, dayhoff, seed, mx) { };
 
     virtual void add_hash(HashIntoType h) {
       if ((max_hash and h <= max_hash) or not max_hash) {
@@ -415,7 +509,7 @@ class KmerMinAbundance: public KmerMinHash {
     std::copy(it2_m, other.mins.end(), out_m);
     std::copy(it2_a, other.abunds.end(), out_a);
 
-    if (merged_mins.size() < num) {
+    if (merged_mins.size() < num || !num) {
         mins = merged_mins;
         abunds = merged_abunds;
     } else {
diff --git a/sourmash/np_utils.py b/sourmash/np_utils.py
new file mode 100644
index 0000000000..a06ffd4e0a
--- /dev/null
+++ b/sourmash/np_utils.py
@@ -0,0 +1,21 @@
+import numpy as np
+import tempfile
+
+
+def to_memmap(array):
+    """Write an array to a memory-mapped file.
+    Create a memory-map to an array stored in a binary file on disk.
+    Memory-mapped files are used for accessing small segments of
+    large files on disk, without reading the entire file into memory.
+    :param np.array array: array to memory-map
+    :return: np.array large_memmap, the memory-mapped array
+    :return: str filename, name of the file the memory-mapped array is written to
+    """
+    filename = tempfile.NamedTemporaryFile(prefix="array", suffix=".mmap", delete=False).name
+    shape = array.shape
+    f = np.memmap(filename, mode='w+', shape=shape, dtype=array.dtype)
+    f[:] = array[:]
+    # deleting the writable memmap flushes the data to disk
+    del f
+    large_memmap = np.memmap(filename, dtype=array.dtype, shape=shape)
+    return large_memmap, filename
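Reviewer note: a quick, self-contained round-trip check for the new `to_memmap` helper (illustrative only; the data should come back unchanged from the on-disk copy):

```python
# Hedged sketch of the to_memmap round trip.
import numpy as np

from sourmash.np_utils import to_memmap

arr = np.arange(12, dtype=np.float64).reshape(3, 4)
memmapped, filename = to_memmap(arr)

# The memory-mapped copy is backed by `filename` on disk but indexes
# like a normal ndarray.
np.testing.assert_array_equal(memmapped, arr)
print(filename)  # e.g. /tmp/array<random>.mmap
```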
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index e08761387b..295dcc3477 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -46,7 +46,8 @@ def _check_abundance_compatibility(sig1, sig2):
 def _flatten(mh):
     "turn off track_abundance on a MinHash object"
     mh_params = list(mh.__getstate__())
-    mh_params[5] = False
+    # track_abundance is at index 6 of the __getstate__ tuple
+    mh_params[6] = False
     mh.__setstate__(mh_params)
     assert not mh.track_abundance
 
@@ -54,8 +55,10 @@ def _flatten(mh):
 def _set_num_scaled(mh, num, scaled):
     "set num and scaled values on a MinHash object"
     mh_params = list(mh.__getstate__())
+    # the number of hashes is at index 0
     mh_params[0] = num
-    mh_params[6] = get_max_hash_for_scaled(scaled)
+    # max_hash (the scaled cutoff) is at index 7
+    mh_params[7] = get_max_hash_for_scaled(scaled)
     mh.__setstate__(mh_params)
     assert mh.num == num
     assert mh.scaled == scaled
@@ -111,7 +114,10 @@ def describe(args):
         ksize = mh.ksize
         moltype = 'DNA'
         if mh.is_protein:
-            moltype = 'protein'
+            if mh.dayhoff:
+                moltype = 'dayhoff'
+            else:
+                moltype = 'protein'
         scaled = mh.scaled
         num = mh.num
         seed = mh.seed
diff --git a/sourmash/signature.py b/sourmash/signature.py
index d4fd4ab3fe..53d23d402b 100644
--- a/sourmash/signature.py
+++ b/sourmash/signature.py
@@ -99,8 +99,10 @@ def _save(self):
             sketch['mins'] = list(map(int, minhash.get_mins()))
             sketch['md5sum'] = self.md5sum()
 
-            if minhash.is_protein:
+            if minhash.is_protein and not minhash.dayhoff:
                 sketch['molecule'] = 'protein'
+            elif minhash.dayhoff:
+                sketch['molecule'] = 'dayhoff'
             else:
                 sketch['molecule'] = 'DNA'
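Reviewer note: a hedged sketch of how the new 'dayhoff' molecule type is expected to round-trip through save/load, using the extended MinHash constructor from this patch (the peptide is an arbitrary example):

```python
# Sketch: 'dayhoff' should survive a JSON save/load round trip.
import sourmash
from sourmash import signature
from sourmash.signature import SourmashSignature

mh = sourmash.MinHash(n=500, ksize=21, is_protein=True, dayhoff=True)
mh.add_protein('MVLSPADKTNVKAAW')  # arbitrary example peptide

sig = SourmashSignature(mh, name='dayhoff-example')
json_str = signature.save_signatures([sig])
assert '"molecule":"dayhoff"' in json_str

loaded = next(signature.load_signatures(json_str))
assert loaded.minhash.dayhoff
```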
diff --git a/sourmash/signature_json.py b/sourmash/signature_json.py
index b7c53915fe..143f5a02de 100644
--- a/sourmash/signature_json.py
+++ b/sourmash/signature_json.py
@@ -6,10 +6,9 @@
 # This was written for Python 3, may be there is a chance it will work with Python 2...
 from __future__ import print_function, unicode_literals
 
-import sys
-
 import io
 import json
+import time
 try:
     import ijson.backends.yajl2 as ijson
 except ImportError:
@@ -82,16 +81,23 @@ def _json_next_signature(iterable,
     molecule = d.get('molecule', 'DNA')
     if molecule == 'protein':
         is_protein = True
+        dayhoff = False
+    elif molecule == "dayhoff":
+        is_protein = True
+        dayhoff = True
     elif molecule.upper() == 'DNA':
         is_protein = False
+        dayhoff = False
     else:
         raise Exception("unknown molecule type: {}".format(molecule))
+
     track_abundance = False
     if 'abundances' in d:
         track_abundance = True
 
     e = MinHash(ksize=ksize, n=n, is_protein=is_protein,
+                dayhoff=dayhoff,
                 track_abundance=track_abundance,
                 max_hash=max_hash,
                 seed=seed)
@@ -229,16 +235,15 @@ def load_signatures_json(data, ksize=None, ignore_md5sum=True, ijson=ijson):
             notify('\r...sig loading {:,}', n, flush=True)
 
 
-def save_signatures_json(siglist, fp=None, indent=None, sort_keys=True):
-    """ Save multiple signatures into a JSON string (or into file handle 'fp')
+def add_meta_save(siglist):
+    """ Convert signatures into a list of JSON-ready record dicts
 
     - siglist: sequence of SourmashSignature objects
-    - fp:
-    - indent: indentation spaces (an integer) or if None no indentation
-    - sort_keys: sort the keys in mappings before writting to JSON
+    - returns: list of records, one per distinct (name, filename) pair
     """
     from .signature import SIGNATURE_VERSION
-
+    records = []
     top_records = {}
+
     for sig in siglist:
         name, filename, sketch = sig._save()
         k = (name, filename)
@@ -247,9 +252,8 @@
         x = top_records[k]
 
     if not top_records:
-        return ""
+        return records
 
-    records = []
     for (name, filename), sketches in top_records.items():
         record = {}
         if name:
@@ -263,9 +267,12 @@
         record['hash_function'] = '0.murmur64'
         record['license'] = 'CC0'
         record['email'] = ''
-        records.append(record)
+        records.append(record)
+
+    return records
+
+
+def write_records_to_json(records, fp=None, indent=None, sort_keys=True):
     s = json.dumps(records, indent=indent, sort_keys=sort_keys,
                    separators=(str(','), str(':')))
     if fp:
         try:
@@ -273,5 +280,21 @@
             fp.write(s)
         except TypeError:
             fp.write(unicode(s))
         return None
+    return s
+
+
+def save_signatures_json(
+        siglist, fp=None, indent=None, sort_keys=True):
+    """ Save multiple signatures into a JSON string (or into file handle 'fp')
+    - siglist: sequence of SourmashSignature objects
+    - fp: file handle to the location of a sig file
+    - indent: indentation spaces (an integer) or if None no indentation
+    - sort_keys: sort the keys in mappings before writing to JSON
+    """
+    startt = time.time()
+    records = add_meta_save(siglist)
+    if records == []:
+        return ""
+    s = write_records_to_json(records, fp, indent, sort_keys)
+    notify("time taken to save signatures is {:.5f} seconds", time.time() - startt, end="\r")
+    return s
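Reviewer note: the refactor splits serialization into record building (`add_meta_save`) and JSON writing (`write_records_to_json`); a hedged sketch of how the two pieces compose, equivalent to calling `save_signatures_json` directly (the 'class' key is assumed to be part of every saved record):

```python
# Sketch of the refactored save path (illustrative only).
import sourmash
from sourmash.signature import SourmashSignature
from sourmash.signature_json import add_meta_save, write_records_to_json

mh = sourmash.MinHash(n=1, ksize=20)
sig = SourmashSignature(mh, name='example')

records = add_meta_save([sig])             # build plain dicts; no I/O
json_str = write_records_to_json(records)  # fp=None returns a string
assert '"class":"sourmash_signature"' in json_str
```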
diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py
index 214f6086f7..93562002ce 100644
--- a/sourmash/sourmash_args.py
+++ b/sourmash/sourmash_args.py
@@ -53,6 +53,13 @@ def add_moltype_args(parser):
                         help='do not choose a protein signature')
     parser.set_defaults(protein=False)
 
+    parser.add_argument('--dayhoff', dest='dayhoff', action='store_true',
+                        help='build Dayhoff-encoded amino acid signatures (default: False)')
+    parser.add_argument('--no-dayhoff', dest='dayhoff',
+                        action='store_false',
+                        help='do not build Dayhoff-encoded amino acid signatures')
+    parser.set_defaults(dayhoff=False)
+
     parser.add_argument('--dna', '--rna', dest='dna', default=None,
                         action='store_true',
                         help='choose a nucleotide signature (default: True)')
@@ -70,6 +77,13 @@ def add_construct_moltype_args(parser):
                         help='do not build protein signatures')
     parser.set_defaults(protein=False)
 
+    parser.add_argument('--dayhoff', dest='dayhoff', action='store_true',
+                        help='build Dayhoff-encoded amino acid signatures (default: False)')
+    parser.add_argument('--no-dayhoff', dest='dayhoff',
+                        action='store_false',
+                        help='do not build Dayhoff-encoded amino acid signatures')
+    parser.set_defaults(dayhoff=False)
+
     parser.add_argument('--dna', '--rna', dest='dna', default=None,
                         action='store_true',
                         help='build nucleotide signatures (default: True)')
@@ -89,6 +103,8 @@ def get_moltype(sig, require=False):
         moltype = 'DNA'
     elif sig.minhash.is_molecule_type('protein'):
         moltype = 'protein'
+    elif sig.minhash.is_molecule_type('dayhoff'):
+        moltype = 'dayhoff'
     else:
         raise ValueError('unknown molecule type for sig {}'.format(sig.name()))
 
@@ -107,6 +123,8 @@ def calculate_moltype(args, default=None):
         moltype = 'protein'
     elif args.dna:
         moltype = 'DNA'
+    elif args.dayhoff:
+        moltype = 'dayhoff'
 
     return moltype
 
diff --git a/sourmash/tenx.py b/sourmash/tenx.py
index 32fecf0d4e..8dd3596899 100644
--- a/sourmash/tenx.py
+++ b/sourmash/tenx.py
@@ -1,20 +1,268 @@
+"""
+10x-sequencing specific utility functions.
+"""
+
 import os
+from .logging import notify
+from collections import defaultdict
+import tempfile
+import time
+import numpy as np
+
+CELL_BARCODES = ['CB', 'XC']
+UMIS = ['UB', 'XM']
+
+
+def pass_alignment_qc(alignment, barcodes):
+    """
+    Check for a high-quality mapping, a QC-passing barcode, and a UMI.
+
+    alignment : pysam.AlignedSegment
+        aligned bam segment
+    barcodes : list
+        List of cellular barcode strings
+    Returns
+    -------
+    pass_qc : boolean
+        True for a high-quality, non-duplicate alignment with a QC-passing
+        barcode and a UMI; False otherwise
+    """
+    high_quality_mapping = alignment.mapq == 255
+    if barcodes is not None:
+        good_cell_barcode = any(
+            [alignment.has_tag(cb) and alignment.get_tag(cb) in barcodes
+             for cb in CELL_BARCODES])
+    else:
+        good_cell_barcode = any([alignment.has_tag(cb) for cb in CELL_BARCODES])
+    good_molecular_barcode = any([alignment.has_tag(umi) for umi in UMIS])
+    not_duplicate = not alignment.is_duplicate
+
+    pass_qc = high_quality_mapping and good_cell_barcode and \
+        good_molecular_barcode and not_duplicate
+    return pass_qc
+
+
+def parse_barcode_renamer(barcodes, barcode_renamer):
+    """
+    Return a dictionary mapping each cell barcode to its renamed barcode.
+
+    barcodes : list
+        List of cellular barcode strings
+    barcode_renamer : str
+        Path to a tab-separated file mapping barcodes to their new names,
+        e.g. with a channel or cell annotation label:
+        AAATGCCCAAACTGCT-1 lung_epithelial_cell|AAATGCCCAAACTGCT-1
+    Returns
+    -------
+    renamer : dict
+        A (str, str) mapping of the original barcode to its new name
+    """
+    if barcode_renamer is not None:
+        renamer = {}
+
+        with open(barcode_renamer) as f:
+            for line in f.readlines():
+                barcode, renamed = line.split()
+                assert barcode in barcodes
+                renamer[barcode] = renamed
+    else:
+        renamer = dict(zip(barcodes, barcodes))
+    return renamer
+
+
+def read_barcodes_file(barcode_path):
+    """Read a single-column barcodes.tsv file from 10x.
+
+    Parameters
+    ----------
+    barcode_path : str
+        Name of a 10x 'barcodes.tsv' file
+    Returns
+    -------
+    barcodes : list
+        List of QC-passing barcodes from 'barcodes.tsv'
+    """
+    with open(barcode_path) as f:
+        barcodes = np.unique([line.strip() for line in f])
+    return barcodes
+
+
+def read_bam_file(bam_path):
+    """Read from a QC-pass bam file.
+
+    Parameters
+    ----------
+    bam_path : str
+        Name of a 10x cellranger bam file,
+        'possorted_genome_bam.bam'
+    Returns
+    -------
+    bam_file : pysam.AlignmentFile
+        Iterator over the possorted_genome_bam.bam file
+    """
+    import pysam
+
+    return pysam.AlignmentFile(bam_path, mode='rb')
+
+
+def shard_bam_file(bam_file_path, chunked_file_line_count, shards_folder):
+    """Shard a QC-pass bam file with the given line count, saving shards to shards_folder.
+
+    Parameters
+    ----------
+    bam_file_path : str
+        Bam file to shard
+    chunked_file_line_count : int
+        number of lines/alignment reads in each sharded bam file
+    shards_folder : str
+        absolute path to save the sharded bam files to
+    Returns
+    -------
+    shards : list
+        list of sharded bam filenames
+    """
+    import pysam
+
+    notify("Sharding the bam file")
+    startt = time.time()
+    file_names = []
+
+    with read_bam_file(bam_file_path) as bam_file:
+        line_count = 0
+        file_count = 0
+        header = bam_file.header
+        for index, alignment in enumerate(bam_file):
+            if line_count == 0:
+                file_name = os.path.join(
+                    shards_folder,
+                    "temp_bam_shard_{}.bam".format(file_count))
+                file_names.append(file_name)
+                outf = pysam.AlignmentFile(file_name, "wb", header=header)
+            if line_count == chunked_file_line_count:
+                file_count = file_count + 1
+                line_count = 0
+                outf.write(alignment)
+                outf.close()
+                notify("===== Sharding bam file ====== {}", file_count,
+                       end="\r")
+            else:
+                outf.write(alignment)
+                line_count = line_count + 1
+        outf.close()
+
+    notify("time taken to shard the bam file into {} shards is {:.5f} seconds",
+           file_count, time.time() - startt)
+    return file_names
+
+
+def bam_to_fasta(barcodes, barcode_renamer, delimiter, umi_filter, bam_file):
+    """Convert a 10x bam to a one-record-per-cell fasta.
+
+    Parameters
+    ----------
+    barcodes : list of str
+        QC-passing barcodes
+    barcode_renamer : str or None
+        Tab-separated filename mapping a barcode to a new name, e.g.
+        AAATGCCCAAACTGCT-1 lung_epithelial_cell|AAATGCCCAAACTGCT-1
+    delimiter : str
+        A character outside the DNA/protein alphabet, used to separate a
+        cell's sequences: e.g. if a cell has two sequences 'AAAAAAAAA' and
+        'CCCCCCCC', they are concatenated as 'AAAAAAAAAXCCCCCCCC'.
+    umi_filter : boolean
+        If True, the UMI is written as the fasta record name and the cell
+        barcode becomes the fasta file name in the output fastas.
+    bam_file : str
+        Path to a 10x bam file; opened with pysam.
+    Returns
+    -------
+    filenames : list
+        one temp fasta filename for each cell's high-quality, non-duplicate
+        reads
+    """
+    bam = read_bam_file(bam_file)
+
+    # Keep only high-quality alignments and/or alignments with selected
+    # barcodes
+    bam_filtered = (x for x in bam if pass_alignment_qc(x, barcodes))
+    if barcode_renamer is not None and barcodes is not None:
+        renamer = parse_barcode_renamer(barcodes, barcode_renamer)
+    else:
+        renamer = None
+    cell_sequences = defaultdict(str)
+
+    for alignment in bam_filtered:
+        # Get the barcode of the alignment, which looks like
+        # "AAATGCCCAAACTGCT-1"; a bam file may carry a good cell barcode
+        # under any of the tags in CELL_BARCODES
+        for cb in CELL_BARCODES:
+            if alignment.has_tag(cb):
+                barcode = alignment.get_tag(cb)
+
+        # If umi_filter is True, add the umi to the cell barcode, separated
+        # by the delimiter: if "AAATGCCCAAACTGCT-1" is the CELL_BARCODE and
+        # AGCT is the UMI, the key in cell_sequences is
+        # AAATGCCCAAACTGCT-1XAGCT
+        renamed = renamer[barcode] if renamer is not None else barcode
+        if umi_filter:
+            for umi_tag in UMIS:
+                if alignment.has_tag(umi_tag):
+                    umi = alignment.get_tag(umi_tag)
+                    renamed = renamed + delimiter + umi
+
+        # Make one long string of all the cell's sequences, separated
+        # by a non-alphabet letter
+        cell_sequences[renamed] += alignment.seq + delimiter
+
+    filenames = list(set(write_cell_sequences(cell_sequences, umi_filter, delimiter)))
+    notify("bam_to_fasta conversion completed on {}", bam_file, end='\r', flush=True)
+    bam.close()
-
-def read_single_column(filename):
-    """Read single-column barcodes.tsv and genes.tsv files from 10x"""
-    with open(filename) as f:
-        lines = set(line.strip() for line in f)
-    return lines
+    return filenames
 
-def read_10x_folder(folder):
-    """Get QC-pass barcodes, genes, and bam file from a 10x folder"""
-    import bamnostic as bs
+def write_cell_sequences(cell_sequences, umi_filter=False, delimiter="X"):
+    """
+    Write each cell's sequences to an individual file.
+
+    Parameters
+    ----------
+    cell_sequences : dict
+        Mapping of a cell to its corresponding sequence. If umi_filter is
+        True, the cell key is expected to also contain the umi, separated
+        by the delimiter: e.g. the dictionary is {AAAAAAAAA: AGCTACACTA}
+        if umi_filter is False, else {AAAAAAAAAXACTAG: AGCTACACTA}. In the
+        latter case AAAAAAAAA is the cell barcode and ACTAG is the umi.
+        The umi is used further by downstream processing functions as
+        appropriate; the barcode is returned as the fasta filename and the
+        umi is saved as the record name/sequence id in the fasta file.
+    umi_filter : boolean, default False
+        If True, the umi is written in the fasta file as the record name,
+        and the cell barcode is the fasta file name in the output fastas.
+    delimiter : str, default X
+        Used to separate the barcode and umi in the cell_sequences keys.
 
-    barcodes = read_single_column(os.path.join(folder, 'barcodes.tsv'))
+    Returns
+    -------
+    filenames : generator
+        one temp fasta filename per cell (or cell_umi) with sequence
+    """
+    temp_folder = tempfile.mkdtemp()
 
-    bam_file = bs.AlignmentFile(
-        os.path.join(folder, 'possorted_genome_bam.bam'), mode='rb')
+    for cell, seq in cell_sequences.items():
+        if umi_filter:
+            barcode, umi = cell.split(delimiter)
+            filename = os.path.join(temp_folder, barcode + '.fasta')
 
-    return barcodes, bam_file
+            # Append to an existing barcode file with a different umi;
+            # records after the first start on a fresh line so the fasta
+            # stays well-formed
+            with open(filename, "a") as f:
+                if os.stat(filename).st_size != 0:
+                    f.write("\n>{}\n{}".format(umi, seq))
+                else:
+                    f.write(">{}\n{}".format(umi, seq))
+            yield filename
+        else:
+            filename = os.path.join(temp_folder, cell + '.fasta')
+            with open(filename, "w") as f:
+                f.write(">{}\n{}".format(cell, seq))
+            yield filename
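Reviewer note: a hedged sketch of the intended 10x workflow built from the helpers above (requires pysam; the input paths are hypothetical):

```python
# Sketch: turn a 10x bam into per-cell fastas, ready for `sourmash compute`.
from sourmash.tenx import bam_to_fasta, read_barcodes_file

barcodes = read_barcodes_file("10x-example/barcodes.tsv")  # hypothetical path
fastas = bam_to_fasta(barcodes=barcodes, barcode_renamer=None,
                      delimiter="X", umi_filter=False,
                      bam_file="10x-example/possorted_genome_bam.bam")
# One temporary fasta per cell barcode.
print(len(fastas))
```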
diff --git a/src/lib.rs b/src/lib.rs
index 63ce3c1b2b..e9c1b47400 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,8 +29,6 @@ pub mod sketch;
 #[cfg(feature = "from-finch")]
 pub mod from;
 
-pub mod cmd;
-
 use cfg_if::cfg_if;
 use murmurhash3::murmurhash3_x64_128;
 
@@ -39,6 +37,8 @@ cfg_if! {
         pub mod wasm;
     } else {
         pub mod ffi;
+
+        pub mod cmd;
     }
 }
diff --git a/src/sketch/minhash.rs b/src/sketch/minhash.rs
index 059e8a17d2..e95ab3fdc5 100644
--- a/src/sketch/minhash.rs
+++ b/src/sketch/minhash.rs
@@ -497,7 +497,7 @@ impl SigsTrait for KmerMinHash {
         for kmer in sequence.windows(self.ksize as usize) {
             if _checkdna(kmer) {
                 let rc = revcomp(kmer);
-                if kmer < &rc {
+                if kmer < rc.as_slice() {
                     self.add_word(kmer);
                 } else {
                     self.add_word(&rc);
diff --git a/src/wasm.rs b/src/wasm.rs
index a328e64a4e..3bd88e54af 100644
--- a/src/wasm.rs
+++ b/src/wasm.rs
@@ -2,7 +2,8 @@ use wasm_bindgen::prelude::*;
 
 use serde_json;
 
-use crate::KmerMinHash;
+use crate::signature::SigsTrait;
+use crate::sketch::minhash::KmerMinHash;
 
 #[wasm_bindgen]
 impl KmerMinHash {
diff --git a/tests/conftest.py b/tests/conftest.py
index 37ac98a848..28d42ac5c5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,35 @@ def track_abundance(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def dayhoff(request):
+    return request.param
+
+
 @pytest.fixture(params=[2, 5, 10])
 def n_children(request):
     return request.param
+
+
+# --- BEGIN - Only run tests using a particular fixture --- #
+# Cribbed from: http://pythontesting.net/framework/pytest/pytest-run-tests-using-particular-fixture/
+def pytest_collection_modifyitems(items, config):
+    fixture_name = config.option.usesfixture
+    if fixture_name is not None:
+        selected_items = []
+        deselected_items = []
+
+        for item in items:
+            if fixture_name in getattr(item, 'fixturenames', ()):
+                selected_items.append(item)
+            else:
+                deselected_items.append(item)
+        config.hook.pytest_deselected(items=deselected_items)
+        items[:] = selected_items
+
+
+def pytest_addoption(parser):
+    parser.addoption("--usesfixture",
+                     action="store",
+                     default=None,
+                     help="just run tests that use a particular fixture")
+# --- END - Only run tests using a particular fixture --- #
diff --git a/tests/test-data/10x-example/barcodes_renamer.tsv b/tests/test-data/10x-example/barcodes_renamer.tsv
new file mode 100644
index 0000000000..2bf405423e
--- /dev/null
+++ b/tests/test-data/10x-example/barcodes_renamer.tsv
@@ -0,0 +1,10 @@
+AAACGGGTCTCGTATT-1 lung_epithelial_cell|AAACGGGTCTCGTATT-1
+AAACGGGAGGATATAC-1 human_epithelial_cell|AAACGGGAGGATATAC-1
+AAAGATGCAGATCTGT-1 mouse_epithelial_cell|AAAGATGCAGATCTGT-1
+AAATGCCAGATAGTCA-1 lung_epithelial_cell|AAATGCCAGATAGTCA-1
+AAATGCCCAAACTGCT-1 lung_epithelial_cell|AAATGCCCAAACTGCT-1
+AAATGCCGTGAACCTT-1 lung_epithelial_cell|AAATGCCGTGAACCTT-1
+AACACGTAGTGTACCT-1 lung_epithelial_cell|AACACGTAGTGTACCT-1
+AACACGTGTGGCTCCA-1 lung_epithelial_cell|AACACGTGTGGCTCCA-1
+AACCATGAGTTGTCGT-1 lung_epithelial_cell|AACCATGAGTTGTCGT-1
+AACCATGCACGTCAGC-1 lung_epithelial_cell|AACCATGCACGTCAGC-1
diff --git a/tests/test-data/10x-example/possorted_genome_bam_filtered.bam b/tests/test-data/10x-example/possorted_genome_bam_filtered.bam
new file mode 100644
index 0000000000..7266d5a511
Binary files /dev/null and b/tests/test-data/10x-example/possorted_genome_bam_filtered.bam differ
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index e3d593e0df..4508f9440b 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -107,9 +107,9 @@ def test_bytes_dna(track_abundance):
     assert len(b) == 1
 
 
-def test_bytes_protein(track_abundance):
+def test_bytes_protein(track_abundance, dayhoff):
     # verify that we can hash protein/aa sequences
-    mh = MinHash(10, 6, True, track_abundance=track_abundance)
+    mh = MinHash(10, 6, True, dayhoff=dayhoff, track_abundance=track_abundance)
     mh.add_protein('AGYYG')
     mh.add_protein(u'AGYYG')
     mh.add_protein(b'AGYYG')
 
     assert len(mh.get_mins()) == 4
 
 
-def test_protein(track_abundance):
+def test_protein(track_abundance, dayhoff):
     # verify that we can hash protein/aa sequences
-    mh = MinHash(10, 6, True, track_abundance=track_abundance)
+    mh = MinHash(10, 6, True, dayhoff=dayhoff, track_abundance=track_abundance)
     mh.add_protein('AGYYG')
 
     assert len(mh.get_mins()) == 4
 
 
+def test_translate_codon(track_abundance):
+    # Ensure that translation occurs properly
+    mh = MinHash(10, 6, is_protein=True)
+    assert "S" == mh.translate_codon('TCT')
+    assert "S" == mh.translate_codon('TC')
+    assert "X" == mh.translate_codon("T")
+
+    with pytest.raises(ValueError):
+        mh.translate_codon("")
+
+    with pytest.raises(ValueError):
+        mh.translate_codon("TCTA")
+
+
+def test_dayhoff(track_abundance):
+    # verify that we can hash dayhoff-encoded protein/aa sequences
+    mh_dayhoff = MinHash(10, 6, is_protein=True,
+                         dayhoff=True, track_abundance=track_abundance)
+    mh_dayhoff.add_sequence('ACTGAC')
+
+    assert len(mh_dayhoff.get_mins()) == 2
+    # verify that dayhoff-encoded hashes are different from protein/aa hashes
+    mh_protein = MinHash(10, 6, is_protein=True, track_abundance=track_abundance)
+    mh_protein.add_sequence('ACTGAC')
+
+    assert len(mh_protein.get_mins()) == 2
+    assert mh_protein.get_mins() != mh_dayhoff.get_mins()
+
+
 def test_protein_short(track_abundance):
     # verify that we can hash protein/aa sequences
     mh = MinHash(10, 9, True, track_abundance=track_abundance)
@@ -483,6 +510,42 @@ def test_mh_merge(track_abundance):
     assert d.compare(c) == 1.0
 
 
+def test_mh_merge_empty_num(track_abundance):
+    # test merging two identically configured minhashes, one empty
+    a = MinHash(20, 10, track_abundance=track_abundance)
+
+    b = MinHash(20, 10, track_abundance=track_abundance)
+    for i in range(0, 80, 4):
+        b.add_hash(i)
+
+    c = a.merge(b)
+    d = b.merge(a)
+
+    assert len(c)
+    assert len(c) == len(d)
+    assert c.get_mins() == d.get_mins()
+    assert c.compare(d) == 1.0
+    assert d.compare(c) == 1.0
+
+
+def test_mh_merge_empty_scaled(track_abundance):
+    # test merging two identically configured minhashes, one empty
+    a = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
+
+    b = MinHash(0, 10, scaled=1, track_abundance=track_abundance)
+    for i in range(0, 80,
4): + b.add_hash(i) + + c = a.merge(b) + d = b.merge(a) + + assert len(c) + assert len(c) == len(d) + assert c.get_mins() == d.get_mins() + assert c.compare(d) == 1.0 + assert d.compare(c) == 1.0 + + def test_mh_merge_check_length(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) for i in range(0, 40, 2): diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 57b9adad4f..16f3a8ca30 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -4,6 +4,7 @@ from __future__ import print_function, unicode_literals import csv import shutil +import os import pytest @@ -623,6 +624,85 @@ def test_sig_describe_1(c): for line in expected_output: assert line.strip() in out +@utils.in_tempdir +def test_sig_describe_1_dayhoff(c): + # get basic info on a signature + testdata = utils.get_test_data('short.fa') + c.run_sourmash('compute', '-k', '21,30', + '--dayhoff', '--protein', + '--dna', + testdata) + # stdout should be new signature + computed_sig = os.path.join(c.location, 'short.fa.sig') + c.run_sourmash('sig', 'describe', computed_sig) + + out = c.last_result.out + print(c.last_result) + + # Add final trailing slash for this OS + testdata_dirname = os.path.dirname(testdata) + os.sep + location = c.location + os.sep + + expected_output = """\ +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: e45a080101751e044d6df861d3d0f3fd +k=21 molecule=protein num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: ef4fa1f3a90f3873187370f1eacc0d9a +k=21 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 1136a8a68420bd93683e45cdaf109b80 +k=21 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 4244d1612598af044e799587132f007e +k=30 molecule=protein num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 5647819f2eac913e04af51c8d548ad56 +k=30 molecule=dayhoff num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 + +--- +signature filename: short.fa.sig +signature: short.fa +source file: short.fa +md5: 71f7c111c01785e5f38efad45b00a0e1 +k=30 molecule=DNA num=500 scaled=0 seed=42 track_abundance=0 +size: 500 +signature license: CC0 +""".splitlines() + for line in out.splitlines(): + cleaned_line = line.strip().replace( + testdata_dirname, '').replace(location, '') + assert cleaned_line in expected_output + @utils.in_tempdir def test_sig_describe_1_multisig(c): diff --git a/tests/test_compare.py b/tests/test_compare.py new file mode 100644 index 0000000000..9555993c6b --- /dev/null +++ b/tests/test_compare.py @@ -0,0 +1,60 @@ +import glob +import os + +import numpy as np +import pytest + +import sourmash +from sourmash.compare import compare_all_pairs, compare_parallel, compare_serial +from . 
import sourmash_tst_utils as utils + + +@pytest.fixture() +def siglist(): + demo_path = utils.get_test_data("demo") + filenames = sorted(glob.glob(os.path.join(demo_path, "*.sig"))) + sigs = [] + for filename in filenames: + sigs.extend(sourmash.load_signatures(filename)) + return sigs + + +@pytest.fixture() +def ignore_abundance(track_abundance): + return not track_abundance + + +def test_compare_serial(siglist, ignore_abundance): + similarities = compare_serial(siglist, ignore_abundance, downsample=False) + + true_similarities = np.array( + [[1., 0.356, 0.078, 0.086, 0., 0., 0.], + [0.356, 1., 0.072, 0.078, 0., 0., 0.], + [0.078, 0.072, 1., 0.074, 0., 0., 0.], + [0.086, 0.078, 0.074, 1., 0., 0., 0.], + [0., 0., 0., 0., 1., 0.382, 0.364], + [0., 0., 0., 0., 0.382, 1., 0.386], + [0., 0., 0., 0., 0.364, 0.386, 1.]]) + + np.testing.assert_array_equal(similarities, true_similarities) + + +def test_compare_parallel(siglist, ignore_abundance): + similarities = compare_parallel(siglist, ignore_abundance, downsample=False, n_jobs=2) + + true_similarities = np.array( + [[1., 0.356, 0.078, 0.086, 0., 0., 0.], + [0.356, 1., 0.072, 0.078, 0., 0., 0.], + [0.078, 0.072, 1., 0.074, 0., 0., 0.], + [0.086, 0.078, 0.074, 1., 0., 0., 0.], + [0., 0., 0., 0., 1., 0.382, 0.364], + [0., 0., 0., 0., 0.382, 1., 0.386], + [0., 0., 0., 0., 0.364, 0.386, 1.]]) + + np.testing.assert_array_equal(similarities, true_similarities) + + +def test_compare_all_pairs(siglist, ignore_abundance): + similarities_parallel = compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=2) + similarities_serial = compare_serial(siglist, ignore_abundance, downsample=False) + np.testing.assert_array_equal(similarities_parallel, similarities_serial) diff --git a/tests/test_np_utils.py b/tests/test_np_utils.py new file mode 100644 index 0000000000..50aaa756f4 --- /dev/null +++ b/tests/test_np_utils.py @@ -0,0 +1,18 @@ +import numpy as np +from sourmash.signature import SourmashSignature +import sourmash +from sourmash.np_utils import to_memmap + + +def test_memmap(): + + e1 = sourmash.MinHash(n=1, ksize=20) + sig1 = SourmashSignature(e1) + + e2 = sourmash.MinHash(n=1, ksize=25) + sig2 = SourmashSignature(e2) + siglist = [sig1, sig2] + memmapped, filename = to_memmap(np.array(siglist)) + # Assert that the data didn't change as a result of memory-mapping + np.testing.assert_array_equal(memmapped, siglist) + assert filename.endswith(".mmap") diff --git a/tests/test_signature_json.py b/tests/test_signature_json.py index 7112d0963f..8a3e3823ad 100644 --- a/tests/test_signature_json.py +++ b/tests/test_signature_json.py @@ -11,7 +11,7 @@ load_signatureset_json_iter, save_signatures_json) from collections import OrderedDict - + def test__json_next_atomic_array(): t = (2,3,4,5,6) s = json.dumps(t) @@ -111,6 +111,40 @@ def test_load_signaturesset_json_iter(): assert len(sig_entries) == 2 +# integration test more than a unit test +def test_load_signaturesset_json_iter_molecules(): + + t = list() + molecules = 'DNA', 'protein', 'dayhoff' + names = "Foo", 'Bar', "Biz" + filenames = '/tmp/foo', '/tmp/bar', '/tmp/biz' + + for molecule, name, filename in zip(molecules, names, filenames): + minhash = (2,3,4,5,6) + t.append(OrderedDict(( + ('class', 'sourmash_signature'), + ('name', name), + ('filename', filename), + ('signatures', + ( + OrderedDict((('ksize', 21), + ('num', len(minhash)), + #('md5sum', ), + ('molecule', molecule), + ('cardinality', 123456), + ('mins', minhash))), + ))))) + + s = json.dumps(t) + if sys.version_info[0] < 3: + s 
= unicode(s) + # no MD5SUM + sig_entries = tuple(load_signatureset_json_iter(io.StringIO(s), + ignore_md5sum=True, + ijson=ijson)) + # Ensure all molecule types were read properly + assert len(sig_entries) == 3 + def test_save_load_multisig_json(): e1 = sourmash.MinHash(n=1, ksize=20) sig1 = SourmashSignature(e1) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 69964abd4a..d7abdf7a5e 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5,7 +5,6 @@ import os import gzip import shutil -import time import screed import glob import json @@ -26,6 +25,7 @@ from sourmash import signature from sourmash import VERSION + def test_run_sourmash(): status, out, err = utils.runscript('sourmash', [], fail_ok=True) assert status != 0 # no args provided, ok ;) @@ -56,644 +56,45 @@ def test_sourmash_info_verbose(): assert "loaded from path" in err -def test_do_sourmash_compute(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name().endswith('short.fa') - - -def test_do_sourmash_compute_output_valid_file(): - """ Trigger bug #123 """ - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(location, 'short.fa.sig') - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', sigfile, - testdata1, - testdata2, testdata3], - in_directory=location) - - assert os.path.exists(sigfile) - assert not out # stdout should be empty - - # is it valid json? - with open(sigfile, 'r') as f: - data = json.load(f) - - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) - - -def test_do_sourmash_compute_output_stdout_valid(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', '-', - testdata1, - testdata2, testdata3], - in_directory=location) - - # is it valid json? - data = json.loads(out) - - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) - - -def test_do_sourmash_compute_output_and_name_valid_file(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(location, 'short.fa.sig') - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '-o', sigfile, - '--merge', '"name"', - testdata1, - testdata2, testdata3], - in_directory=location) - - assert os.path.exists(sigfile) - assert 'calculated 1 signatures for 4 sequences taken from 3 files' in err - - # is it valid json? 
- with open(sigfile, 'r') as f: - data = json.load(f) - - assert len(data) == 1 - - all_testdata = " ".join([testdata1, testdata2, testdata3]) - sigfile_merged = os.path.join(location, 'short.all.fa.sig') - cmd = "cat {} | sourmash compute -k 31 -o {} -".format( - all_testdata, sigfile_merged) - status, out, err = utils.run_shell_cmd(cmd, in_directory=location) - - with open(sigfile_merged, 'r') as f: - data_merged = json.load(f) - - assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins'] - - -def test_do_sourmash_compute_singleton(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--singleton', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name().endswith('shortName') - - -def test_do_sourmash_compute_10x(): - bamnostic = pytest.importorskip('bamnostic') - - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('10x-example') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', - '--input-is-10x', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, '10x-example.sig') - assert os.path.exists(sigfile) - - with open(sigfile) as f: - data = json.load(f) - - barcode_signatures = [sig['name'] for sig in data] - - with open(utils.get_test_data('10x-example/barcodes.tsv')) as f: - true_barcodes = set(x.strip() for x in f.readlines()) - - # Ensure that every cell barcode in barcodes.tsv has a signature - assert all(bc in true_barcodes for bc in barcode_signatures) - - - -def test_do_sourmash_compute_name(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1, '-o', 'foo.sig'], - in_directory=location) - - sigfile = os.path.join(location, 'foo.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name() == 'foo' - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--name', 'foo', - testdata1, '-o', 'foo2.sig'], - in_directory=location) - - sigfile2 = os.path.join(location, 'foo2.sig') - assert os.path.exists(sigfile) - - sig2 = next(signature.load_signatures(sigfile)) - assert sig2.name() == 'foo' - assert sig.name() == sig2.name() - - -def test_do_sourmash_compute_name_fail_no_output(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1], - in_directory=location, - fail_ok=True) - assert status == -1 - - -def test_do_sourmash_compute_merge_fail_no_output(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--merge', 'foo', - testdata1], - in_directory=location, - fail_ok=True) - assert status == -1 - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', '--name', 'foo', - testdata1], - in_directory=location, - fail_ok=True) - assert status == -1 - - -def test_do_sourmash_compute_name_from_first(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short3.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '31', 
'--name-from-first', - testdata1], - in_directory=location) - - sigfile = os.path.join(location, 'short3.fa.sig') - assert os.path.exists(sigfile) - - sig = next(signature.load_signatures(sigfile)) - assert sig.name() == 'firstname' - - -def test_do_sourmash_compute_multik(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 31 in ksizes - - -def test_do_sourmash_compute_multik_with_protein(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.fa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 4 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_multik_with_nothing(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--no-protein', '--no-dna', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short.fa.sig') - assert not os.path.exists(outfile) - - -def test_do_sourmash_compute_multik_protein_bad_ksize(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '20,32', - '--protein', '--no-dna', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short.fa.sig') - assert not os.path.exists(outfile) - assert 'protein ksizes must be divisible by 3' in err - - @utils.in_tempdir -def test_do_sourmash_compute_multik_only_protein(c): - # check sourmash compute with only protein, no nucl - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('compute', '-k', '21,30', - '--protein', '--no-dna', testdata1) - outfile = os.path.join(c.location, 'short.fa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_multik_protein_input_non_div3_ksize(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short-protein.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '20,32', - '--protein', '--no-dna', - '--input-is-protein', - testdata1], - in_directory=location, - fail_ok=True) - outfile = os.path.join(location, 'short-protein.fa.sig') - assert os.path.exists(outfile) - - -@utils.in_tempdir -def test_do_sourmash_compute_multik_only_protein_no_rna(c): - # test --no-rna as well (otherwise identical to previous test) - testdata1 = utils.get_test_data('short.fa') - - c.run_sourmash('compute', '-k', '21,30', - '--protein', '--no-rna', testdata1) - outfile = os.path.join(c.location, 'short.fa.sig') - assert 
os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_protein_bad_sequences(): - """Proper error handling when Ns in dna sequence""" - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.bad.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--protein', '--no-dna', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'short.bad.fa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - -def test_do_sourmash_compute_multik_input_is_protein(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,30', - '--input-is-protein', - testdata1], - in_directory=location) - outfile = os.path.join(location, 'ecoli.faa.sig') - assert os.path.exists(outfile) - - with open(outfile, 'rt') as fp: - sigdata = fp.read() - siglist = list(signature.load_signatures(sigdata)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 30 in ksizes - - moltype = set([ x.minhash.is_molecule_type('protein') - for x in siglist ]) - assert len(moltype) == 1 - assert True in moltype - - -def test_do_sourmash_compute_multik_outfile(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) - assert 21 in ksizes - assert 31 in ksizes - - -def test_do_sourmash_compute_with_scaled_1(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - max_hashes = [ x.minhash.max_hash for x in siglist ] - assert len(max_hashes) == 2 - assert set(max_hashes) == { sourmash.MAX_HASH } - - -def test_do_sourmash_compute_with_scaled_2(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '2', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - max_hashes = [ x.minhash.max_hash for x in siglist ] - assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /2.) 
]) - - -def test_do_sourmash_compute_with_scaled(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '100', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - max_hashes = [ x.minhash.max_hash for x in siglist ] - assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /100.) ]) - - -def test_do_sourmash_compute_with_bad_scaled(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '-1', - testdata1, '-o', outfile], - in_directory=location, - fail_ok=True) - - assert status != 0 - assert '--scaled value must be >= 1' in err - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1000.5', - testdata1, '-o', outfile], - in_directory=location, - fail_ok=True) - - assert status != 0 - assert '--scaled value must be integer value' in err - - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--scaled', '1e9', - testdata1, '-o', outfile], - in_directory=location) - - assert status == 0 - assert 'WARNING: scaled value is nonsensical!?' in err - - -def test_do_sourmash_compute_with_seed(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - outfile = os.path.join(location, 'FOO.xxx') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21,31', - '--seed', '43', - testdata1, '-o', outfile], - in_directory=location) - assert os.path.exists(outfile) - - siglist = list(signature.load_signatures(outfile)) - assert len(siglist) == 2 - - seeds = [ x.minhash.seed for x in siglist ] - assert len(seeds) == 2 - assert set(seeds) == set([ 43 ]) - - -def test_do_sourmash_check_protein_comparisons(): - # this test checks 2 x 2 protein comparisons with E. coli genes. - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--input-is-protein', - '--singleton', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.faa.sig') - assert os.path.exists(sig1) - - testdata2 = utils.get_test_data('ecoli.genes.fna') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--protein', '--no-dna', - '--singleton', - testdata2], - in_directory=location) - sig2 = os.path.join(location, 'ecoli.genes.fna.sig') - assert os.path.exists(sig2) - - # I'm not sure why load_signatures is randomizing order, but ok. 
- x = list(signature.load_signatures(sig1)) - sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name()) - - x = list(signature.load_signatures(sig2)) - sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name()) - - name1 = sig1_aa.name().split()[0] - assert name1 == 'NP_414543.1' - name2 = sig2_aa.name().split()[0] - assert name2 == 'NP_414544.1' - name3 = sig1_trans.name().split()[0] - assert name3 == 'gi|556503834:2801-3733' - name4 = sig2_trans.name().split()[0] - assert name4 == 'gi|556503834:337-2799' - - print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3)) - print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3)) - print(name1, name4, round(sig1_aa.similarity(sig2_trans), 3)) - print(name2, name4, round(sig2_aa.similarity(sig2_trans), 3)) - - assert round(sig1_aa.similarity(sig1_trans), 3) == 0.0 - assert round(sig2_aa.similarity(sig1_trans), 3) == 0.166 - assert round(sig1_aa.similarity(sig2_trans), 3) == 0.174 - assert round(sig2_aa.similarity(sig2_trans), 3) == 0.0 - - -@utils.in_tempdir -def test_do_sourmash_check_knowngood_dna_comparisons(c): - # this test checks against a known good signature calculated - # by utils/compute-dna-mh-another-way.py - testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('compute', '-k', '21', - '--singleton', '--dna', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') - assert os.path.exists(sig1) - - x = list(signature.load_signatures(sig1)) - sig1, sig2 = sorted(x, key=lambda x: x.name()) - - knowngood = utils.get_test_data('benchmark.dna.sig') - good = list(signature.load_signatures(knowngood))[0] - - assert sig2.similarity(good) == 1.0 - - -@utils.in_tempdir -def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c): - # check the --rna flag; otherwise identical to previous test. 
- testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('compute', '-k', '21', '--singleton', '--rna', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') - assert os.path.exists(sig1) - - x = list(signature.load_signatures(sig1)) - sig1, sig2 = sorted(x, key=lambda x: x.name()) - - knowngood = utils.get_test_data('benchmark.dna.sig') - good = list(signature.load_signatures(knowngood))[0] - - assert sig2.similarity(good) == 1.0 - - -def test_do_sourmash_check_knowngood_input_protein_comparisons(): - # this test checks against a known good signature calculated - # by utils/compute-input-prot-another-way.py - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.faa') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--input-is-protein', - '--singleton', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.faa.sig') - assert os.path.exists(sig1) - - x = list(signature.load_signatures(sig1)) - sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name()) - - knowngood = utils.get_test_data('benchmark.input_prot.sig') - good_aa = list(signature.load_signatures(knowngood))[0] - - assert sig1_aa.similarity(good_aa) == 1.0 +def test_do_serial_compare(c): + # try doing a compare serial + import numpy + testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = glob.glob(testsigs) + c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs) -def test_do_sourmash_check_knowngood_protein_comparisons(): - # this test checks against a known good signature calculated - # by utils/compute-prot-mh-another-way.py - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('ecoli.genes.fna') - status, out, err = utils.runscript('sourmash', - ['compute', '-k', '21', - '--singleton', '--protein', - '--no-dna', - testdata1], - in_directory=location) - sig1 = os.path.join(location, 'ecoli.genes.fna.sig') - assert os.path.exists(sig1) + cmp_outfile = c.output('cmp') + assert os.path.exists(cmp_outfile) + cmp_out = numpy.load(cmp_outfile.encode('utf-8')) - x = list(signature.load_signatures(sig1)) - sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name()) + sigs = [] + for fn in testsigs: + sigs.append(sourmash.load_one_signature(fn, ksize=21, + select_moltype='dna')) - knowngood = utils.get_test_data('benchmark.prot.sig') - good_trans = list(signature.load_signatures(knowngood))[0] + cmp_calc = numpy.zeros([len(sigs), len(sigs)]) + for i, si in enumerate(sigs): + for j, sj in enumerate(sigs): + cmp_calc[i][j] = si.similarity(sj) - assert sig2_trans.similarity(good_trans) == 1.0 + sigs = [] + for fn in testsigs: + sigs.append(sourmash.load_one_signature(fn, ksize=21, + select_moltype='dna')) + assert (cmp_out == cmp_calc).all() @utils.in_tempdir -def test_do_basic_compare(c): - # try doing a basic compare +def test_do_compare_parallel(c): + # try doing a compare parallel import numpy testsigs = utils.get_test_data('genome-s1*.sig') testsigs = glob.glob(testsigs) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs) + c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', + "--processes", "2", *testsigs) cmp_outfile = c.output('cmp') assert os.path.exists(cmp_outfile) @@ -762,7 +163,6 @@ def test_do_compare_quiet(): def test_do_traverse_directory_compare(): - import numpy with utils.TempDirectory() as location: status, out, err = utils.runscript('sourmash', ['compare', '--traverse-directory', @@ -866,6 +266,39 @@ def test_do_compare_output_multiple_moltype(): assert 
'multiple molecule types loaded;' in err
 
 
+def test_do_compare_dayhoff():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21', '--dayhoff',
+                                            '--no-dna', testdata1],
+                                           in_directory=location)
+        assert status == 0
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21', '--dayhoff',
+                                            '--no-dna', testdata2],
+                                           in_directory=location)
+        assert status == 0
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compare', 'short.fa.sig',
+                                            'short2.fa.sig', '--dayhoff',
+                                            '--csv', 'xxx'],
+                                           in_directory=location)
+        true_out = '''[1. 0.94]
+[0.94 1. ]
+min similarity in matrix: 0.940'''.splitlines()
+        # every expected line should appear in the cleaned output
+        cleaned_lines = [line.split('...')[-1].strip()
+                         for line in out.splitlines()]
+        for expected in true_out:
+            assert expected in cleaned_lines
+        assert status == 0
+
+
 def test_do_plot_comparison():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('short.fa')
@@ -930,6 +363,31 @@ def test_do_plot_comparison_3():
     assert os.path.exists(os.path.join(location, "cmp.matrix.png"))
 
 
+def test_do_plot_comparison_4_output_dir():
+    with utils.TempDirectory() as location:
+        output_dir = os.path.join(location, 'xyz_test')
+
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', testdata1, testdata2],
+                                           in_directory=location)
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compare', 'short.fa.sig',
+                                            'short2.fa.sig', '-o', 'cmp'],
+                                           in_directory=location)
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['plot', 'cmp', '--labels',
+                                            '--output-dir', output_dir],
+                                           in_directory=location)
+
+        assert os.path.exists(os.path.join(output_dir, "cmp.dendro.png"))
+        assert os.path.exists(os.path.join(output_dir, "cmp.matrix.png"))
+
+
 def test_do_plot_comparison_5_force():
     import numpy
     with utils.TempDirectory() as location:
@@ -3282,6 +2740,20 @@ def test_gather_abund_10_1_ignore_abundance():
     assert 'genome-s12.fa.gz' not in out
 
 
+@utils.in_tempdir
+def test_gather_output_unassigned_with_abundance(c):
+    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
+    against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig')
+
+    c.run_sourmash('gather', query, against, '--output-unassigned',
+                   c.output('unassigned.sig'))
+
+    assert os.path.exists(c.output('unassigned.sig'))
+
+    ss = sourmash.load_one_signature(c.output('unassigned.sig'))
+    assert ss.minhash.track_abundance
+
+
 def test_sbt_categorize():
     with utils.TempDirectory() as location:
         testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py
new file mode 100644
index 0000000000..a3dea02bd0
--- /dev/null
+++ b/tests/test_sourmash_compute.py
@@ -0,0 +1,772 @@
+"""
+Tests for sourmash compute command-line functionality.
+"""
+from __future__ import print_function, unicode_literals
+import os
+import gzip
+import shutil
+import screed
+import glob
+import json
+import csv
+import pytest
+
+from .
+from . import sourmash_tst_utils as utils
+import sourmash
+from sourmash import MinHash
+from sourmash.sbt import SBT, Node
+from sourmash.sbtmh import SigLeaf, load_sbt_index
+
+from sourmash import signature
+from sourmash import VERSION
+
+
+def test_do_sourmash_compute():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name().endswith('short.fa')
+
+
+def test_do_sourmash_compute_output_valid_file():
+    """ Trigger bug #123 """
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        testdata3 = utils.get_test_data('short3.fa')
+        sigfile = os.path.join(location, 'short.fa.sig')
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '-o', sigfile,
+                                            testdata1,
+                                            testdata2, testdata3],
+                                           in_directory=location)
+
+        assert os.path.exists(sigfile)
+        assert not out # stdout should be empty
+
+        # is it valid json?
+        with open(sigfile, 'r') as f:
+            data = json.load(f)
+
+        filesigs = [sig['filename'] for sig in data]
+        assert all(testdata in filesigs
+                   for testdata in (testdata1, testdata2, testdata3))
+
+
+def test_do_sourmash_compute_output_stdout_valid():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        testdata3 = utils.get_test_data('short3.fa')
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '-o', '-',
+                                            testdata1,
+                                            testdata2, testdata3],
+                                           in_directory=location)
+
+        # is it valid json?
+        data = json.loads(out)
+
+        filesigs = [sig['filename'] for sig in data]
+        assert all(testdata in filesigs
+                   for testdata in (testdata1, testdata2, testdata3))
+
+
+def test_do_sourmash_compute_output_and_name_valid_file():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        testdata2 = utils.get_test_data('short2.fa')
+        testdata3 = utils.get_test_data('short3.fa')
+        sigfile = os.path.join(location, 'short.fa.sig')
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '-o', sigfile,
+                                            '--merge', '"name"',
+                                            testdata1,
+                                            testdata2, testdata3],
+                                           in_directory=location)
+
+        assert os.path.exists(sigfile)
+        assert 'calculated 1 signatures for 4 sequences taken from 3 files' in err
+
+        # is it valid json?
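+        # (--merge should collapse all three inputs into a single record)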
+        with open(sigfile, 'r') as f:
+            data = json.load(f)
+
+        assert len(data) == 1
+
+        all_testdata = " ".join([testdata1, testdata2, testdata3])
+        sigfile_merged = os.path.join(location, 'short.all.fa.sig')
+        cmd = "cat {} | sourmash compute -k 31 -o {} -".format(
+                all_testdata, sigfile_merged)
+        status, out, err = utils.run_shell_cmd(cmd, in_directory=location)
+
+        with open(sigfile_merged, 'r') as f:
+            data_merged = json.load(f)
+
+        assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins']
+
+
+def test_do_sourmash_compute_singleton():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--singleton',
+                                            testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name().endswith('shortName')
+
+
+def test_do_sourmash_compute_10x():
+    pytest.importorskip('pysam')
+    pytest.importorskip('pathos')
+
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+        barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--line-count', '50',
+                                            '--input-is-10x',
+                                            '--protein',
+                                            '--barcodes-file',
+                                            barcodes_file,
+                                            testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'possorted_genome_bam.bam.sig')
+        assert os.path.exists(sigfile)
+        siglist = list(signature.load_signatures(sigfile))
+        assert len(siglist) == 16
+        barcode_signatures = list(set([sig.name() for sig in siglist]))
+
+        with open(utils.get_test_data('10x-example/barcodes.tsv')) as f:
+            true_barcodes = set(x.strip() for x in f.readlines())
+
+        # Ensure that every signature's barcode is present in barcodes.tsv
+        assert all(bc in true_barcodes for bc in barcode_signatures)
+        min_hashes = [x.minhash.get_mins() for x in siglist]
+        assert all(mins != [] for mins in min_hashes)
+
+        # Filtered bam file with no barcodes file
+        # should run sourmash compute successfully
+        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31',
+                                            '--dna',
+                                            '--input-is-10x',
+                                            testdata1,
+                                            '-o', '10x-example_dna.sig'],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, '10x-example_dna.sig')
+        assert os.path.exists(sigfile)
+        siglist = list(signature.load_signatures(sigfile))
+        assert len(siglist) == 32
+        min_hashes = [x.minhash.get_mins() for x in siglist]
+        assert all(mins != [] for mins in min_hashes)
+
+        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+        csv_path = os.path.join(location, "all_barcodes_meta.csv")
+        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
+        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31',
+                                            '--dna', '--count-valid-reads', '10',
+                                            '--input-is-10x',
+                                            testdata1,
+                                            '--write-barcode-meta-csv', csv_path,
+                                            '--barcodes', barcodes_path,
+                                            '--rename-10x-barcodes', renamer_path,
+                                            '--save-fastas', location,
+                                            '-o', '10x-example_dna.sig'],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, '10x-example_dna.sig')
+        assert os.path.exists(sigfile)
+        siglist = list(signature.load_signatures(sigfile))
+        assert len(siglist) == 1
+        min_hashes = [x.minhash.get_mins() for x in siglist]
+        assert all(mins != [] for mins in min_hashes)
+
+        with open(csv_path, 'r') as f:
+            data = [line.split() for line in f]
+        assert len(data) == 9
+        barcodes = [filename.replace(".fasta", "") for filename in os.listdir(location) if filename.endswith('.fasta')]
+        assert len(barcodes) == 1
+        assert barcodes[0] == 'lung_epithelial_cell|AAATGCCCAAACTGCT-1'
+
+
+def test_do_sourmash_compute_name():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--merge', 'foo',
+                                            testdata1, '-o', 'foo.sig'],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'foo.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name() == 'foo'
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name', 'foo',
+                                            testdata1, '-o', 'foo2.sig'],
+                                           in_directory=location)
+
+        sigfile2 = os.path.join(location, 'foo2.sig')
+        assert os.path.exists(sigfile2)
+
+        sig2 = next(signature.load_signatures(sigfile2))
+        assert sig2.name() == 'foo'
+        assert sig.name() == sig2.name()
+
+
+def test_do_sourmash_compute_name_fail_no_output():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name', 'foo',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        assert status == -1
+
+
+def test_do_sourmash_compute_merge_fail_no_output():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--merge', 'foo',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        assert status == -1
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name', 'foo',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        assert status == -1
+
+
+def test_do_sourmash_compute_name_from_first():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short3.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '31', '--name-from-first',
+                                            testdata1],
+                                           in_directory=location)
+
+        sigfile = os.path.join(location, 'short3.fa.sig')
+        assert os.path.exists(sigfile)
+
+        sig = next(signature.load_signatures(sigfile))
+        assert sig.name() == 'firstname'
+
+
+def test_do_sourmash_compute_multik():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 31 in ksizes
+
+
+def test_do_sourmash_compute_multik_with_protein():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--protein',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 4
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+
+
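+# The --dayhoff variants below mirror the --protein multi-k tests above.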
+def test_do_sourmash_compute_multik_with_dayhoff():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--dayhoff', '--no-dna',
+                                            testdata1],
+                                           in_directory=location)
+        assert 'Computing only Dayhoff-encoded protein (and not nucleotide) ' \
+               'signatures.' in err
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+            assert all(x.minhash.dayhoff for x in siglist)
+
+
+def test_do_sourmash_compute_multik_with_dayhoff_and_dna():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--dayhoff',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 4
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+            assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2
+            assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2
+
+
+def test_do_sourmash_compute_multik_with_dayhoff_dna_protein():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--dayhoff', '--protein',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 6
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+            assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2
+            assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2
+            assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2
+
+
+def test_do_sourmash_compute_multik_with_nothing():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--no-protein', '--no-dna',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert not os.path.exists(outfile)
+
+
+def test_do_sourmash_compute_multik_protein_bad_ksize():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '20,32',
+                                            '--protein', '--no-dna',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        outfile = os.path.join(location, 'short.fa.sig')
+        assert not os.path.exists(outfile)
+        assert 'protein ksizes must be divisible by 3' in err
+
+
+@utils.in_tempdir
+def test_do_sourmash_compute_multik_only_protein(c):
+    # check sourmash compute with only protein, no nucl
+    testdata1 = utils.get_test_data('short.fa')
+    c.run_sourmash('compute', '-k', '21,30',
+                   '--protein', '--no-dna', testdata1)
+    outfile = os.path.join(c.location, 'short.fa.sig')
+    assert os.path.exists(outfile)
+
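+    # expect one signature per ksize, both protein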
+    with open(outfile, 'rt') as fp:
+        sigdata = fp.read()
+        siglist = list(signature.load_signatures(sigdata))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 30 in ksizes
+
+
+def test_do_sourmash_compute_multik_protein_input_non_div3_ksize():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short-protein.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '20,32',
+                                            '--protein', '--no-dna',
+                                            '--input-is-protein',
+                                            testdata1],
+                                           in_directory=location,
+                                           fail_ok=True)
+        outfile = os.path.join(location, 'short-protein.fa.sig')
+        # with --input-is-protein, ksizes need not be a multiple of 3
+        assert os.path.exists(outfile)
+
+
+@utils.in_tempdir
+def test_do_sourmash_compute_multik_only_protein_no_rna(c):
+    # test --no-rna as well (otherwise identical to previous test)
+    testdata1 = utils.get_test_data('short.fa')
+
+    c.run_sourmash('compute', '-k', '21,30',
+                   '--protein', '--no-rna', testdata1)
+    outfile = os.path.join(c.location, 'short.fa.sig')
+    assert os.path.exists(outfile)
+
+    with open(outfile, 'rt') as fp:
+        sigdata = fp.read()
+        siglist = list(signature.load_signatures(sigdata))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 30 in ksizes
+
+
+def test_do_sourmash_compute_protein_bad_sequences():
+    """Handle Ns in the DNA sequence gracefully when computing protein sigs."""
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.bad.fa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--protein', '--no-dna',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'short.bad.fa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+
+
+def test_do_sourmash_compute_multik_input_is_protein():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.faa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,30',
+                                            '--input-is-protein',
+                                            testdata1],
+                                           in_directory=location)
+        outfile = os.path.join(location, 'ecoli.faa.sig')
+        assert os.path.exists(outfile)
+
+        with open(outfile, 'rt') as fp:
+            sigdata = fp.read()
+            siglist = list(signature.load_signatures(sigdata))
+            assert len(siglist) == 2
+            ksizes = set([ x.minhash.ksize for x in siglist ])
+            assert 21 in ksizes
+            assert 30 in ksizes
+
+            moltype = set([ x.minhash.is_molecule_type('protein')
+                            for x in siglist ])
+            assert len(moltype) == 1
+            assert True in moltype
+
+
+def test_do_sourmash_compute_multik_outfile():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+        ksizes = set([ x.minhash.ksize for x in siglist ])
+        assert 21 in ksizes
+        assert 31 in ksizes
+
+
+def test_do_sourmash_compute_with_scaled_1():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '1',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
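+        # with --scaled 1, every hash is kept, so max_hash == MAX_HASH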
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        max_hashes = [ x.minhash.max_hash for x in siglist ]
+        assert len(max_hashes) == 2
+        assert set(max_hashes) == { sourmash.MAX_HASH }
+
+
+def test_do_sourmash_compute_with_scaled_2():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '2',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        max_hashes = [ x.minhash.max_hash for x in siglist ]
+        assert len(max_hashes) == 2
+        assert set(max_hashes) == set([ int(2**64 / 2.) ])
+
+
+def test_do_sourmash_compute_with_scaled():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '100',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        max_hashes = [ x.minhash.max_hash for x in siglist ]
+        assert len(max_hashes) == 2
+        assert set(max_hashes) == set([ int(2**64 / 100.) ])
+
+
+def test_do_sourmash_compute_with_bad_scaled():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '-1',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location,
+                                           fail_ok=True)
+
+        assert status != 0
+        assert '--scaled value must be >= 1' in err
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '1000.5',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location,
+                                           fail_ok=True)
+
+        assert status != 0
+        assert '--scaled value must be integer value' in err
+
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--scaled', '1e9',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+
+        assert status == 0
+        assert 'WARNING: scaled value is nonsensical!?' in err
+
+
+def test_do_sourmash_compute_with_seed():
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('short.fa')
+        outfile = os.path.join(location, 'FOO.xxx')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21,31',
+                                            '--seed', '43',
+                                            testdata1, '-o', outfile],
+                                           in_directory=location)
+        assert os.path.exists(outfile)
+
+        siglist = list(signature.load_signatures(outfile))
+        assert len(siglist) == 2
+
+        seeds = [ x.minhash.seed for x in siglist ]
+        assert len(seeds) == 2
+        assert set(seeds) == set([ 43 ])
+
+
+def test_do_sourmash_check_protein_comparisons():
+    # this test checks 2 x 2 protein comparisons with E. coli genes.
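+    # (amino acid signatures from ecoli.faa vs signatures translated from
+    # the corresponding gene sequences in ecoli.genes.fna)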
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.faa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--input-is-protein',
+                                            '--singleton',
+                                            testdata1],
+                                           in_directory=location)
+        sig1 = os.path.join(location, 'ecoli.faa.sig')
+        assert os.path.exists(sig1)
+
+        testdata2 = utils.get_test_data('ecoli.genes.fna')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--protein', '--no-dna',
+                                            '--singleton',
+                                            testdata2],
+                                           in_directory=location)
+        sig2 = os.path.join(location, 'ecoli.genes.fna.sig')
+        assert os.path.exists(sig2)
+
+        # I'm not sure why load_signatures is randomizing order, but ok.
+        x = list(signature.load_signatures(sig1))
+        sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name())
+
+        x = list(signature.load_signatures(sig2))
+        sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name())
+
+        name1 = sig1_aa.name().split()[0]
+        assert name1 == 'NP_414543.1'
+        name2 = sig2_aa.name().split()[0]
+        assert name2 == 'NP_414544.1'
+        name3 = sig1_trans.name().split()[0]
+        assert name3 == 'gi|556503834:2801-3733'
+        name4 = sig2_trans.name().split()[0]
+        assert name4 == 'gi|556503834:337-2799'
+
+        print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3))
+        print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3))
+        print(name1, name4, round(sig1_aa.similarity(sig2_trans), 3))
+        print(name2, name4, round(sig2_aa.similarity(sig2_trans), 3))
+
+        assert round(sig1_aa.similarity(sig1_trans), 3) == 0.0
+        assert round(sig2_aa.similarity(sig1_trans), 3) == 0.166
+        assert round(sig1_aa.similarity(sig2_trans), 3) == 0.174
+        assert round(sig2_aa.similarity(sig2_trans), 3) == 0.0
+
+
+@utils.in_tempdir
+def test_do_sourmash_check_knowngood_dna_comparisons(c):
+    # this test checks against a known good signature calculated
+    # by utils/compute-dna-mh-another-way.py
+    testdata1 = utils.get_test_data('ecoli.genes.fna')
+    c.run_sourmash('compute', '-k', '21',
+                   '--singleton', '--dna',
+                   testdata1)
+    sig1 = c.output('ecoli.genes.fna.sig')
+    assert os.path.exists(sig1)
+
+    x = list(signature.load_signatures(sig1))
+    sig1, sig2 = sorted(x, key=lambda x: x.name())
+
+    knowngood = utils.get_test_data('benchmark.dna.sig')
+    good = list(signature.load_signatures(knowngood))[0]
+
+    assert sig2.similarity(good) == 1.0
+
+
+@utils.in_tempdir
+def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c):
+    # check the --rna flag; otherwise identical to previous test.
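+    # (--rna is accepted as a synonym for --dna, so the result should still
+    # match the known-good DNA signature)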
+    testdata1 = utils.get_test_data('ecoli.genes.fna')
+    c.run_sourmash('compute', '-k', '21', '--singleton', '--rna',
+                   testdata1)
+    sig1 = c.output('ecoli.genes.fna.sig')
+    assert os.path.exists(sig1)
+
+    x = list(signature.load_signatures(sig1))
+    sig1, sig2 = sorted(x, key=lambda x: x.name())
+
+    knowngood = utils.get_test_data('benchmark.dna.sig')
+    good = list(signature.load_signatures(knowngood))[0]
+
+    assert sig2.similarity(good) == 1.0
+
+
+def test_do_sourmash_check_knowngood_input_protein_comparisons():
+    # this test checks against a known good signature calculated
+    # by utils/compute-input-prot-another-way.py
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.faa')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--input-is-protein',
+                                            '--singleton',
+                                            testdata1],
+                                           in_directory=location)
+        sig1 = os.path.join(location, 'ecoli.faa.sig')
+        assert os.path.exists(sig1)
+
+        x = list(signature.load_signatures(sig1))
+        sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name())
+
+        knowngood = utils.get_test_data('benchmark.input_prot.sig')
+        good_aa = list(signature.load_signatures(knowngood))[0]
+
+        assert sig1_aa.similarity(good_aa) == 1.0
+
+
+def test_do_sourmash_check_knowngood_protein_comparisons():
+    # this test checks against a known good signature calculated
+    # by utils/compute-prot-mh-another-way.py
+    with utils.TempDirectory() as location:
+        testdata1 = utils.get_test_data('ecoli.genes.fna')
+        status, out, err = utils.runscript('sourmash',
+                                           ['compute', '-k', '21',
+                                            '--singleton', '--protein',
+                                            '--no-dna',
+                                            testdata1],
+                                           in_directory=location)
+        sig1 = os.path.join(location, 'ecoli.genes.fna.sig')
+        assert os.path.exists(sig1)
+
+        x = list(signature.load_signatures(sig1))
+        sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name())
+
+        knowngood = utils.get_test_data('benchmark.prot.sig')
+        good_trans = list(signature.load_signatures(knowngood))[0]
+
+        assert sig2_trans.similarity(good_trans) == 1.0
diff --git a/tests/test_tenx.py b/tests/test_tenx.py
new file mode 100644
index 0000000000..b585303ab8
--- /dev/null
+++ b/tests/test_tenx.py
@@ -0,0 +1,128 @@
+import pytest
+pytest.importorskip('pysam')
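+# pysam is an optional dependency; importorskip skips this whole test module
+# when it is not installed.
+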
+from . import sourmash_tst_utils as utils
+import sourmash.tenx as sourmash_tenx
+import pysam as bs
+
+
+def test_read_barcodes_file():
+    filename = utils.get_test_data('10x-example/barcodes.tsv')
+    barcodes = sourmash_tenx.read_barcodes_file(filename)
+    assert len(barcodes) == 10
+
+
+def test_read_bam_file():
+    filename = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+    bam_file = sourmash_tenx.read_bam_file(filename)
+    assert isinstance(bam_file, bs.AlignmentFile)
+    total_alignments = sum(1 for _ in bam_file)
+    assert total_alignments == 1714
+
+
+def test_shard_bam_file():
+    filename = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+    bam_file = sourmash_tenx.read_bam_file(filename)
+    assert isinstance(bam_file, bs.AlignmentFile)
+
+    expected_alignments = sum(1 for _ in bam_file)
+    with utils.TempDirectory() as location:
+        bam_shard_files = sourmash_tenx.shard_bam_file(
+            filename, expected_alignments, location)
+        assert len(bam_shard_files) == 1
+
+    num_shards = 2
+    with utils.TempDirectory() as location:
+        bam_shard_files = sourmash_tenx.shard_bam_file(
+            filename, expected_alignments // num_shards, location)
+        assert len(bam_shard_files) == 2
+
+        total_alignments = 0
+        for bam_file in bam_shard_files:
+            total_alignments += sum(1 for _ in sourmash_tenx.read_bam_file(bam_file))
+        assert total_alignments == expected_alignments
+
+        whole_bam_file = sourmash_tenx.read_bam_file(filename)
+        for bam_file in bam_shard_files:
+            for line in sourmash_tenx.read_bam_file(bam_file):
+                assert line == next(whole_bam_file)
+
+
+def test_pass_alignment_qc():
+    barcodes = sourmash_tenx.read_barcodes_file(
+        utils.get_test_data('10x-example/barcodes.tsv'))
+    bam = sourmash_tenx.read_bam_file(
+        utils.get_test_data('10x-example/possorted_genome_bam.bam'))
+
+    total_pass = sum(1 for alignment in bam if
+                     sourmash_tenx.pass_alignment_qc(alignment, barcodes))
+    assert total_pass == 439
+
+
+def test_pass_alignment_qc_filtered():
+    bam = sourmash_tenx.read_bam_file(
+        utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam'))
+    total_alignments = sum(1 for _ in bam)
+    bam = sourmash_tenx.read_bam_file(
+        utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam'))
+    assert total_alignments == 1500
+    total_pass = sum(1 for alignment in bam if
+                     sourmash_tenx.pass_alignment_qc(alignment, None))
+    assert total_pass == 192
+
+
+def test_parse_barcode_renamer():
+    filename = utils.get_test_data('10x-example/barcodes.tsv')
+    barcodes = sourmash_tenx.read_barcodes_file(filename)
+    renamer = sourmash_tenx.parse_barcode_renamer(barcodes, None)
+    for key, value in renamer.items():
+        assert key == value
+    assert len(renamer) == len(barcodes)
+
+    renamer = sourmash_tenx.parse_barcode_renamer(
+        barcodes, utils.get_test_data('10x-example/barcodes_renamer.tsv'))
+    for key, value in renamer.items():
+        assert key in value
+        assert "epithelial_cell" in value
+    assert len(renamer) == len(barcodes)
+
+
+def test_bam_to_fasta():
+    filename = utils.get_test_data('10x-example/barcodes.tsv')
+    bam_file = utils.get_test_data('10x-example/possorted_genome_bam.bam')
+    barcodes = sourmash_tenx.read_barcodes_file(filename)
+
+    fastas = sourmash_tenx.bam_to_fasta(
+        barcodes=barcodes, barcode_renamer=None, delimiter="X", umi_filter=False, bam_file=bam_file)
+    assert len(list(fastas)) == 8
+
+
+def test_filtered_bam_to_fasta():
+    bam_file = utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam')
+    fastas = sourmash_tenx.bam_to_fasta(
+        barcodes=None, barcode_renamer=None, delimiter='X',
+        umi_filter=False, bam_file=bam_file)
+    assert len(list(fastas)) == 32
+
+
+def test_filtered_bam_to_umi_fasta():
+    bam_file = utils.get_test_data('10x-example/possorted_genome_bam_filtered.bam')
+    fastas = sourmash_tenx.bam_to_fasta(
+        barcodes=None, barcode_renamer=None, delimiter='X', umi_filter=True, bam_file=bam_file)
+    assert len(list(fastas)) == 32
+
+
+def test_write_sequences():
+    cell_sequences = {'AAATGCCCAAACTGCT-1': "atgc", 'AAATGCCCAAAGTGCT-1': "gtga"}
+    fastas = list(sourmash_tenx.write_cell_sequences(cell_sequences))
+    assert len(fastas) == len(cell_sequences)
+    for fasta in fastas:
+        assert fasta.endswith(".fasta")
+
+
+def test_write_sequences_umi():
+    cell_sequences = {'AAATGCCCAXAACTGCT-1': "atgc", 'AAATGCCXCAAAGTGCT-1': "gtga", 'AAATGCCXCAAAGTGCT-2': "gtgc"}
+    fastas = list(sourmash_tenx.write_cell_sequences(cell_sequences, True))
+    assert len(fastas) == len(cell_sequences)
+    for fasta in fastas:
+        assert fasta.endswith(".fasta")
+
diff --git a/tox.ini b/tox.ini
index 5afa1e9c42..93ffa78cc1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,7 +9,7 @@ deps=
     codecov
     ipfshttpclient
     redis
-    bamnostic
+    pysam
     pathos
 commands=
     pip install -r requirements.txt