Skip to content

Commit

Permalink
Merge branch 'master' into feature/v5_loading
Browse files Browse the repository at this point in the history
  • Loading branch information
luizirber authored Sep 25, 2019
2 parents 02af7c1 + 878540f commit f736a75
Show file tree
Hide file tree
Showing 35 changed files with 2,715 additions and 1,045 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ sourmash/_minhash.cpp
.pytest_cache
.python-version
sourmash/version.py
*.DS_Store
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ needletail = { version = "~0.2.1", optional = true }
serde = "1.0"
serde_derive = "~1.0.58"
serde_json = "1.0.2"
ukhs = "0.3.4"
ukhs = { git = "https://github.com/luizirber/ukhs", branch = "feature/alternative_backends", features = ["boomphf_mphf"], default-features = false}
bio = { git = "https://github.com/luizirber/rust-bio", branch = "feature/fastx_reader" }
primal = "0.2.3"
pdatastructs = "0.5.0"
pdatastructs = { git = "https://github.com/luizirber/pdatastructs.rs", branch = "succinct_wasm" }
itertools = "0.8.0"
typed-builder = "0.3.0"
csv = "1.0.7"
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ doc: .PHONY
coverage: all
$(PYTHON) setup.py clean --all
SOURMASH_COVERAGE=1 $(PYTHON) setup.py build_ext -i
$(PYTHON) -m pytest --cov=.
$(PYTHON) -m pytest --cov=. --cov-report term-missing

benchmark: all
asv continuous master
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ sourmash is a product of the
We recommend using bioconda to install sourmash:

```
conda install sourmash
conda install -c conda-forge -c bioconda sourmash
```
This will install the latest stable version of sourmash 2.

Expand Down
Binary file modified doc/_static/cmp.matrix.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 10 additions & 3 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,29 @@ taken.
Grab three bacterial genomes from NCBI:
```
curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Escherichia_coli/reference/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz
curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Salmonella_enterica/reference/GCF_000006945.1_ASM694v1/GCF_000006945.1_ASM694v1_genomic.fna.gz
curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Salmonella_enterica/reference/GCF_000006945.2_ASM694v2/GCF_000006945.2_ASM694v2_genomic.fna.gz
curl -L -O ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Sphingobacteriaceae_bacterium_DW12/latest_assembly_versions/GCF_000783305.1_ASM78330v1/GCF_000783305.1_ASM78330v1_genomic.fna.gz
```
Compute signatures for each:
```
sourmash compute *.fna.gz
sourmash compute -k 31 *.fna.gz
```
This will produce three `.sig` files containing MinHash signatures at k=31.

Next, compare all the signatures to each other:
```
sourmash compare *.sig -o cmp
```

Optionally, run `compare` in parallel on 8 threads with `-p 8`:

```
sourmash compare -p 8 *.sig -o cmp
```

Finally, plot a dendrogram:
```
sourmash plot cmp
sourmash plot cmp --labels
```
This will output two files, `cmp.dendro.png` and `cmp.matrix.png`,
containing a clustering & dendrogram of the sequences, as well as a
Expand Down
6 changes: 6 additions & 0 deletions doc/tutorial-basic.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ Compare all the things:
sourmash compare ecoli_many_sigs/* -o ecoli_cmp
```

Optionally, run the comparison in parallel on 8 threads using `-p 8`:

```
sourmash compare -p 8 ecoli_many_sigs/* -o ecoli_cmp
```

and then plot:

```
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@
'setuptools_scm', 'setuptools_scm_git_archive'],
"use_scm_version": {"write_to": "sourmash/version.py"},
"extras_require": {
'test' : ['pytest', 'pytest-cov', 'numpy', 'matplotlib', 'scipy','recommonmark'],
'test' : ['pytest', 'pytest-cov', 'numpy', 'matplotlib', 'scipy', 'recommonmark'],
'demo' : ['jupyter', 'jupyter_client', 'ipython'],
'doc' : ['sphinx'],
'10x': ['pathos', 'bamnostic>=0.9.2'],
'10x': ['pathos', 'pysam']
},
"include_package_data": True,
"package_data": {
Expand Down
9 changes: 7 additions & 2 deletions sourmash/_minhash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,34 @@ cdef extern from "kmer_min_hash.hh":
const unsigned int num;
const unsigned int ksize;
const bool is_protein;
const bool dayhoff;
const HashIntoType max_hash;
CMinHashType mins;

KmerMinHash(unsigned int, unsigned int, bool, uint32_t, HashIntoType)
KmerMinHash(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType)
void add_hash(HashIntoType) except +ValueError
void remove_hash(HashIntoType) except +ValueError
void add_word(string word) except +ValueError
void add_sequence(const char *, bool) except +ValueError
void merge(const KmerMinHash&) except +ValueError
string aa_to_dayhoff(string aa) except +ValueError
string translate_codon(string codon) except +ValueError
unsigned int count_common(const KmerMinHash&) except +ValueError
unsigned long size()


cdef cppclass KmerMinAbundance(KmerMinHash):
CMinHashType abunds;

KmerMinAbundance(unsigned int, unsigned int, bool, uint32_t, HashIntoType)
KmerMinAbundance(unsigned int, unsigned int, bool, bool, uint32_t, HashIntoType)
void add_hash(HashIntoType) except +ValueError
void remove_hash(HashIntoType) except +ValueError
void add_word(string word) except +ValueError
void add_sequence(const char *, bool) except +ValueError
void merge(const KmerMinAbundance&) except +ValueError
void merge(const KmerMinHash&) except +ValueError
string aa_to_dayhoff(string aa) except +ValueError
string translate_codon(string codon) except +ValueError
unsigned int count_common(const KmerMinAbundance&) except +ValueError
unsigned long size()

Expand Down
63 changes: 49 additions & 14 deletions sourmash/_minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,15 @@ def get_scaled_for_max_hash(max_hash):


cdef bytes to_bytes(s):
if not isinstance(s, (basestring, bytes)):
# Allow for strings, bytes or int
# Single item of byte string = int
if not isinstance(s, (basestring, bytes, int)):
raise TypeError("Requires a string-like sequence")

if isinstance(s, unicode):
s = s.encode('utf-8')
if isinstance(s, int):
s = bytes([s])
return s


Expand Down Expand Up @@ -88,6 +92,7 @@ cdef class MinHash(object):

def __init__(self, unsigned int n, unsigned int ksize,
bool is_protein=False,
bool dayhoff=False,
bool track_abundance=False,
uint32_t seed=MINHASH_DEFAULT_SEED,
HashIntoType max_hash=0,
Expand All @@ -107,9 +112,9 @@ cdef class MinHash(object):

cdef KmerMinHash *mh = NULL
if track_abundance:
mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash)
mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash)
else:
mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash)
mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash)

self._this.reset(mh)

Expand All @@ -122,7 +127,8 @@ cdef class MinHash(object):

def __copy__(self):
a = MinHash(deref(self._this).num, deref(self._this).ksize,
deref(self._this).is_protein, self.track_abundance,
deref(self._this).is_protein, deref(self._this).dayhoff,
self.track_abundance,
deref(self._this).seed, deref(self._this).max_hash)
a.merge(self)
return a
Expand All @@ -135,23 +141,24 @@ cdef class MinHash(object):
return (deref(self._this).num,
deref(self._this).ksize,
deref(self._this).is_protein,
deref(self._this).dayhoff,
self.get_mins(with_abundance=with_abundance),
None, self.track_abundance, deref(self._this).max_hash,
deref(self._this).seed)

def __setstate__(self, tup):
(n, ksize, is_protein, mins, _, track_abundance, max_hash, seed) =\
(n, ksize, is_protein, dayhoff, mins, _, track_abundance, max_hash, seed) =\
tup

self.track_abundance = track_abundance

cdef KmerMinHash *mh = NULL
if track_abundance:
mh = new KmerMinAbundance(n, ksize, is_protein, seed, max_hash)
mh = new KmerMinAbundance(n, ksize, is_protein, dayhoff, seed, max_hash)
self._this.reset(mh)
self.set_abundances(mins)
else:
mh = new KmerMinHash(n, ksize, is_protein, seed, max_hash)
mh = new KmerMinHash(n, ksize, is_protein, dayhoff, seed, max_hash)
self._this.reset(mh)
self.add_many(mins)

Expand All @@ -160,6 +167,7 @@ cdef class MinHash(object):
(deref(self._this).num,
deref(self._this).ksize,
deref(self._this).is_protein,
deref(self._this).dayhoff,
self.track_abundance,
deref(self._this).seed,
deref(self._this).max_hash,
Expand All @@ -173,7 +181,8 @@ cdef class MinHash(object):

def copy_and_clear(self):
a = MinHash(deref(self._this).num, deref(self._this).ksize,
deref(self._this).is_protein, self.track_abundance,
deref(self._this).is_protein, deref(self._this).dayhoff,
self.track_abundance,
deref(self._this).seed, deref(self._this).max_hash)
return a

Expand Down Expand Up @@ -234,6 +243,10 @@ cdef class MinHash(object):
def is_protein(self):
return deref(self._this).is_protein

@property
def dayhoff(self):
return deref(self._this).dayhoff

@property
def ksize(self):
return deref(self._this).ksize
Expand All @@ -247,6 +260,9 @@ cdef class MinHash(object):
def add_hash(self, uint64_t h):
deref(self._this).add_hash(h)

def translate_codon(self, codon):
return deref(self._this).translate_codon(to_bytes(codon))

def count_common(self, MinHash other):
return deref(self._this).count_common(deref(other._this))

Expand All @@ -255,7 +271,8 @@ cdef class MinHash(object):
raise ValueError('new sample n is higher than current sample n')

a = MinHash(new_num, deref(self._this).ksize,
deref(self._this).is_protein, self.track_abundance,
deref(self._this).is_protein, deref(self._this).dayhoff,
self.track_abundance,
deref(self._this).seed, 0)
if self.track_abundance:
a.set_abundances(self.get_mins(with_abundance=True))
Expand Down Expand Up @@ -286,7 +303,8 @@ cdef class MinHash(object):
new_max_hash = get_max_hash_for_scaled(new_num)

a = MinHash(0, deref(self._this).ksize,
deref(self._this).is_protein, self.track_abundance,
deref(self._this).is_protein, deref(self._this).dayhoff,
self.track_abundance,
deref(self._this).seed, new_max_hash)
if self.track_abundance:
a.set_abundances(self.get_mins(with_abundance=True))
Expand All @@ -307,13 +325,15 @@ cdef class MinHash(object):
combined_mh = new KmerMinAbundance(num,
deref(self._this).ksize,
deref(self._this).is_protein,
deref(self._this).dayhoff,
deref(self._this).seed,
deref(self._this).max_hash)

else:
combined_mh = new KmerMinHash(num,
deref(self._this).ksize,
deref(self._this).is_protein,
deref(self._this).dayhoff,
deref(self._this).seed,
deref(self._this).max_hash)

Expand Down Expand Up @@ -424,12 +444,27 @@ cdef class MinHash(object):
if not deref(self._this).is_protein:
raise ValueError("cannot add amino acid sequence to DNA MinHash!")

for i in range(0, len(sequence) - ksize + 1):
deref(self._this).add_word(to_bytes(sequence[i:i + ksize]))
aa_kmers = (sequence[i:i + ksize] for i in range(0, len(sequence) - ksize + 1))
if not self.dayhoff:
for aa_kmer in aa_kmers:
deref(self._this).add_word(to_bytes(aa_kmer))
else:
for aa_kmer in aa_kmers:
dayhoff_kmer = ''
for aa in aa_kmer:
dayhoff_letter = deref(self._this).aa_to_dayhoff(to_bytes(aa))
dayhoff_kmer += dayhoff_letter
# dayhoff_kmer = ''.join( for aa in aa_kmer)
deref(self._this).add_word(to_bytes(dayhoff_kmer))

def is_molecule_type(self, molecule):
if molecule.upper() == 'DNA' and not self.is_protein:
return True
if molecule == 'protein' and self.is_protein:
return True
if self.is_protein:
if self.dayhoff:
if molecule == 'dayhoff':
return True
else:
if molecule == 'protein':
return True
return False
Loading

0 comments on commit f736a75

Please sign in to comment.