Skip to content

Commit

Permalink
Merge branch 'latest' of https://github.com/dib-lab/sourmash into ORC…
Browse files Browse the repository at this point in the history
…IDchecklist
  • Loading branch information
keyabarve committed Mar 29, 2021
2 parents a23abae + 7ed6291 commit def0971
Show file tree
Hide file tree
Showing 11 changed files with 343 additions and 120 deletions.
12 changes: 6 additions & 6 deletions nix/sources.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
"homepage": "",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "5852a21819542e6809f68ba5a798600e69874e76",
"sha256": "05vqlnafz287wamy2a3kp6h32mmha1ahq8gzp7slihdci2ibcdx6",
"rev": "c0e881852006b132236cbf0301bd1939bb50867e",
"sha256": "0fy7z7yxk5n7yslsvx5cyc6h21qwi4bhxf3awhirniszlbvaazy2",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/5852a21819542e6809f68ba5a798600e69874e76.tar.gz",
"url": "https://github.com/NixOS/nixpkgs/archive/c0e881852006b132236cbf0301bd1939bb50867e.tar.gz",
"url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
},
"rust-overlay": {
Expand All @@ -29,10 +29,10 @@
"homepage": null,
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "62d46e74e18babdb1d2b3994dbb91d5fa4672597",
"sha256": "0v7bfn0m0g4yywzxxa8ygll41mb8506kvkfbywdx0g7r6ddvcndr",
"rev": "d414b80c0e6e96977b52b1a0a547ea7613a5c6d5",
"sha256": "14bidf0paxb4hdbq60pxgxijjw7hi5rzdg9vj490rsikk58fb2qs",
"type": "tarball",
"url": "https://github.com/oxalica/rust-overlay/archive/62d46e74e18babdb1d2b3994dbb91d5fa4672597.tar.gz",
"url": "https://github.com/oxalica/rust-overlay/archive/d414b80c0e6e96977b52b1a0a547ea7613a5c6d5.tar.gz",
"url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
}
}
2 changes: 1 addition & 1 deletion src/core/src/ffi/hyperloglog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ unsafe fn hll_from_buffer(ptr: *const c_char, insize: usize) -> Result<*mut Sour
slice::from_raw_parts(ptr as *mut u8, insize)
};

let hll = HyperLogLog::from_reader(&mut &buf[..])?;
let hll = HyperLogLog::from_reader(buf)?;

Ok(SourmashHyperLogLog::from_rust(hll))
}
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/ffi/nodegraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ unsafe fn nodegraph_from_buffer(ptr: *const c_char, insize: usize) -> Result<*mu
slice::from_raw_parts(ptr as *mut u8, insize)
};

let ng = Nodegraph::from_reader(&mut &buf[..])?;
let ng = Nodegraph::from_reader(buf)?;

Ok(SourmashNodegraph::from_rust(ng))
}
Expand Down
3 changes: 3 additions & 0 deletions src/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
//! routines can use the NCBI taxonomy but do not depend on it in any way.
//! Documentation and further examples for each module can be found in the module descriptions below.
// TODO: remove this line and update all the appropriate type names for 1.0
#![allow(clippy::upper_case_acronyms)]

pub mod errors;
pub use errors::SourmashError as Error;

Expand Down
2 changes: 1 addition & 1 deletion src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ impl Signature {

pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Vec<Signature>, Error> {
let mut reader = io::BufReader::new(File::open(path)?);
Ok(Signature::from_reader(&mut reader)?)
Signature::from_reader(&mut reader)
}

pub fn from_reader<R>(rdr: R) -> Result<Vec<Signature>, Error>
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/sketch/hyperloglog/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ impl HyperLogLog {

pub fn from_path<P: AsRef<Path>>(path: P) -> Result<HyperLogLog, Error> {
let mut reader = io::BufReader::new(File::open(path)?);
Ok(HyperLogLog::from_reader(&mut reader)?)
HyperLogLog::from_reader(&mut reader)
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/core/src/sketch/nodegraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ impl Nodegraph {

pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Nodegraph, Error> {
let mut reader = io::BufReader::new(File::open(path)?);
Ok(Nodegraph::from_reader(&mut reader)?)
Nodegraph::from_reader(&mut reader)
}

pub fn tablesizes(&self) -> Vec<u64> {
Expand Down
99 changes: 93 additions & 6 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def select(self, ksize=None, moltype=None):
""

class LinearIndex(Index):
"An Index for a collection of signatures. Can load from a .sig file."
def __init__(self, _signatures=None, filename=None):
self._signatures = []
if _signatures:
Expand Down Expand Up @@ -155,11 +156,97 @@ def load(cls, location):
return lidx

def select(self, ksize=None, moltype=None):
def select_sigs(siglist, ksize, moltype):
for ss in siglist:
if (ksize is None or ss.minhash.ksize == ksize) and \
(moltype is None or ss.minhash.moltype == moltype):
yield ss
def select_sigs(ss, ksize=ksize, moltype=moltype):
if (ksize is None or ss.minhash.ksize == ksize) and \
(moltype is None or ss.minhash.moltype == moltype):
return True

return self.filter(select_sigs)

def filter(self, filter_fn):
siglist = []
for ss in self._signatures:
if filter_fn(ss):
siglist.append(ss)

siglist=select_sigs(self._signatures, ksize, moltype)
return LinearIndex(siglist, self.filename)


class MultiIndex(Index):
    """An Index class that wraps other Index classes.

    The MultiIndex constructor takes two arguments: a list of Index
    objects, and a matching list of sources (filenames, etc.)  If the
    source is not None, then it will be used to override the 'filename'
    in the triple that is returned by search and gather.

    One specific use for this is when loading signatures from a directory;
    MultiIndex will properly record which files provided which signatures.
    """
    def __init__(self, index_list, source_list):
        """Store copies of the wrapped indices and their matching sources.

        Both arguments may be any iterable (including one-shot iterators);
        they are materialized into lists before their lengths are compared.
        """
        self.index_list = list(index_list)
        self.source_list = list(source_list)
        # Compare the materialized lists, not the raw arguments: calling
        # len() on an iterator/generator argument would raise TypeError.
        assert len(self.index_list) == len(self.source_list)

    def signatures(self):
        "Yield every signature from every wrapped index, in index order."
        for idx in self.index_list:
            for ss in idx.signatures():
                yield ss

    def __len__(self):
        "Return the sum of the lengths of the wrapped indices."
        return sum(len(idx) for idx in self.index_list)

    def insert(self, *args):
        "Not supported on MultiIndex."
        raise NotImplementedError

    @classmethod
    def load(cls, *args):
        "Not supported on MultiIndex."
        raise NotImplementedError

    def save(self, *args):
        "Not supported on MultiIndex."
        raise NotImplementedError

    def select(self, ksize=None, moltype=None):
        """Return a new MultiIndex with select() applied to each wrapped index.

        The sources are carried over unchanged, in the same order.
        """
        new_idx_list = [idx.select(ksize=ksize, moltype=moltype)
                        for idx in self.index_list]
        return MultiIndex(new_idx_list, self.source_list)

    def filter(self, filter_fn):
        """Return a new MultiIndex with filter(filter_fn) applied to each
        wrapped index.

        The sources are carried over unchanged, in the same order.
        """
        new_idx_list = [idx.filter(filter_fn) for idx in self.index_list]
        return MultiIndex(new_idx_list, self.source_list)

    def search(self, query, *args, **kwargs):
        """Search every wrapped index and return the combined matches.

        Extra arguments are passed through to each index's search().
        Returns a list of (score, signature, source) triples sorted by
        decreasing score.
        """
        matches = []
        for idx, src in zip(self.index_list, self.source_list):
            for (score, ss, filename) in idx.search(query, *args, **kwargs):
                best_src = src or filename  # override if src provided
                matches.append((score, ss, best_src))

        # highest score first
        matches.sort(key=lambda x: -x[0])
        return matches

    def gather(self, query, *args, **kwargs):
        """Run gather on every wrapped index and return the combined results.

        Extra arguments are passed through to each index's gather().
        Returns a list of (score, signature, source) triples, best score
        first; ties are ordered by the signature's md5sum.
        """
        results = []
        for idx, src in zip(self.index_list, self.source_list):
            for (score, ss, filename) in idx.gather(query, *args, **kwargs):
                best_src = src or filename  # override if src provided
                results.append((score, ss, best_src))

        results.sort(reverse=True, key=lambda x: (x[0], x[1].md5sum()))

        return results
Loading

0 comments on commit def0971

Please sign in to comment.