Skip to content

Commit

Permalink
[MRG] Move greyhound-core into sourmash (#1238)
Browse files Browse the repository at this point in the history
* move greyhound-core into sourmash
* Basic colors implementation. Similar to first Mantis paper, but without using bitvectors.
* Read-only ZipStorage based on piz, pending perf optimization
  • Loading branch information
luizirber authored Feb 14, 2022
1 parent f9a2e96 commit 13bf0d5
Show file tree
Hide file tree
Showing 24 changed files with 1,863 additions and 52 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ include/sourmash.h: src/core/src/lib.rs \
src/core/src/ffi/minhash.rs \
src/core/src/ffi/signature.rs \
src/core/src/ffi/nodegraph.rs \
src/core/src/ffi/index/mod.rs \
src/core/src/ffi/index/revindex.rs \
src/core/src/errors.rs
cd src/core && \
RUSTC_BOOTSTRAP=1 cbindgen -c cbindgen.toml . -o ../../$@
Expand Down
49 changes: 49 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ typedef struct SourmashKmerMinHash SourmashKmerMinHash;

typedef struct SourmashNodegraph SourmashNodegraph;

typedef struct SourmashRevIndex SourmashRevIndex;

typedef struct SourmashSearchResult SourmashSearchResult;

typedef struct SourmashSignature SourmashSignature;

/**
Expand Down Expand Up @@ -302,6 +306,51 @@ SourmashNodegraph *nodegraph_with_tables(uintptr_t ksize,
uintptr_t starting_size,
uintptr_t n_tables);

void revindex_free(SourmashRevIndex *ptr);

const SourmashSearchResult *const *revindex_gather(const SourmashRevIndex *ptr,
const SourmashSignature *sig_ptr,
double threshold,
bool _do_containment,
bool _ignore_abundance,
uintptr_t *size);

uint64_t revindex_len(const SourmashRevIndex *ptr);

SourmashRevIndex *revindex_new_with_paths(const SourmashStr *const *search_sigs_ptr,
uintptr_t insigs,
const SourmashKmerMinHash *template_ptr,
uintptr_t threshold,
const SourmashKmerMinHash *const *queries_ptr,
uintptr_t inqueries,
bool keep_sigs);

SourmashRevIndex *revindex_new_with_sigs(const SourmashSignature *const *search_sigs_ptr,
uintptr_t insigs,
const SourmashKmerMinHash *template_ptr,
uintptr_t threshold,
const SourmashKmerMinHash *const *queries_ptr,
uintptr_t inqueries);

uint64_t revindex_scaled(const SourmashRevIndex *ptr);

const SourmashSearchResult *const *revindex_search(const SourmashRevIndex *ptr,
const SourmashSignature *sig_ptr,
double threshold,
bool do_containment,
bool _ignore_abundance,
uintptr_t *size);

SourmashSignature **revindex_signatures(const SourmashRevIndex *ptr, uintptr_t *size);

SourmashStr searchresult_filename(const SourmashSearchResult *ptr);

void searchresult_free(SourmashSearchResult *ptr);

double searchresult_score(const SourmashSearchResult *ptr);

SourmashSignature *searchresult_signature(const SourmashSearchResult *ptr);

void signature_add_protein(SourmashSignature *ptr, const char *sequence);

void signature_add_sequence(SourmashSignature *ptr, const char *sequence, bool force);
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@


def build_native(spec):
cmd = ["cargo", "build", "--manifest-path", "src/core/Cargo.toml", "--lib"]
cmd = ["cargo", "build",
"--manifest-path", "src/core/Cargo.toml",
# "--features", "parallel",
"--lib"]

target = "debug"
if not DEBUG_BUILD:
Expand Down
6 changes: 6 additions & 0 deletions src/core/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

Added:

- An inverted index, codename Greyhound (#1238)

## [0.11.0] - 2021-07-07

Added:
Expand Down
5 changes: 5 additions & 0 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ az = "1.0.0"
bytecount = "0.6.0"
byteorder = "1.4.3"
cfg-if = "1.0"
counter = "0.5.2"
finch = { version = "0.4.1", optional = true }
fixedbitset = "0.4.0"
getset = "0.1.1"
Expand All @@ -42,6 +43,10 @@ serde_json = "1.0.53"
primal-check = "0.3.1"
thiserror = "1.0"
typed-builder = "0.9.0"
twox-hash = "1.6.0"
vec-collections = "0.3.4"
piz = "0.4.0"
memmap2 = "0.5.0"

[dev-dependencies]
assert_matches = "1.3.0"
Expand Down
1 change: 1 addition & 0 deletions src/core/cbindgen.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ clean = true

[parse.expand]
crates = ["sourmash"]
features = []

[enum]
rename_variants = "QualifiedScreamingSnakeCase"
Expand Down
213 changes: 213 additions & 0 deletions src/core/src/encodings.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::convert::TryFrom;
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::iter::Iterator;
use std::str;

use nohash_hasher::BuildNoHashHasher;
use once_cell::sync::Lazy;

use crate::Error;

// To consider here: use a slab allocator for IdxTracker
// https://twitter.com/tomaka17/status/1391052081272967170
// Pro-tip: you might be able to save a lot of hashmap lookups
// if you replace a `HashMap<K, V>` with a `HashMap<K, usize>`
// and a `Slab<V>`. This might be very useful if K is something
// heavy such as a `String`.
/// Identifier of a color. `Colors::compute_color` derives it by hashing the
/// set of indices the color stands for.
pub type Color = u64;
/// Index of one item tracked inside a color.
/// NOTE(review): presumably the position of a dataset/signature in the
/// reverse index -- confirm against the revindex code.
pub type Idx = u64;
/// Per-color payload: the set of indices, plus a count that `Colors::update`
/// increments every time it hands the color out and decrements when a holder
/// moves on to a different color (the color is removed when it reaches zero).
type IdxTracker = (vec_collections::VecSet<[Idx; 4]>, u64);
/// Map from color to its payload. Uses `BuildNoHashHasher`;
/// NOTE(review): presumably because `Color` keys are already hash values
/// and need no further hashing -- confirm.
type ColorToIdx = HashMap<Color, IdxTracker, BuildNoHashHasher<Color>>;

#[allow(non_camel_case_types)]
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(u32)]
Expand Down Expand Up @@ -357,3 +371,202 @@ pub const VALID: [bool; 256] = {
lookup[b'T' as usize] = true;
lookup
};

/// Table of colors: every color identifies one set of indices, so that many
/// holders sharing the same set can store a single `Color` value instead of
/// the set itself.
#[derive(Serialize, Deserialize, Default)]
pub struct Colors {
// Keyed by color; each value pairs the index set with the count that
// `update` maintains for that color.
colors: ColorToIdx,
}

impl Colors {
    /// Creates an empty color table.
    pub fn new() -> Colors {
        Colors::default()
    }

    /// Returns the color that stands for the indices of `current_color`
    /// extended with `new_idxs`.
    ///
    /// * With `current_color == None` the resulting set is exactly `new_idxs`.
    /// * If `current_color` already contains every new index, the same color
    ///   is returned and its count is incremented.
    /// * Otherwise the enlarged set gets its own color (created on demand, or
    ///   shared with an identical set that is already stored). The count of
    ///   `current_color` is decremented, and the color is dropped from the
    ///   table once that count reaches zero.
    ///
    /// # Panics
    ///
    /// * `current_color` is `Some` but not present in the table.
    /// * The computed color is already stored with a different index set
    ///   (hash collision).
    pub fn update<'a, I: IntoIterator<Item = &'a Idx>>(
        &mut self,
        current_color: Option<Color>,
        new_idxs: I,
    ) -> Result<Color, Error> {
        let color = match current_color {
            Some(color) => color,
            None => {
                // Nothing to extend: the set is made of the new indices only.
                let mut tracker = IdxTracker::default();
                tracker.0.extend(new_idxs.into_iter().cloned());
                tracker.1 = 1;
                let fresh = Colors::compute_color(&tracker);
                self.bump_or_insert(fresh, tracker);
                return Ok(fresh);
            }
        };

        let existing = match self.colors.get_mut(&color) {
            Some(tracker) => tracker,
            None => unimplemented!("throw error, current_color must exist in order to be updated. current_color: {:?}, colors: {:#?}", current_color, &self.colors),
        };

        // Keep only the indices the current color does not cover yet.
        let missing: Vec<_> = new_idxs
            .into_iter()
            .filter(|candidate| !existing.0.contains(candidate))
            .collect();

        if missing.is_empty() {
            // Easy case: the color already covers everything, reuse it.
            existing.1 += 1;
            return Ok(color);
        }

        // Build the enlarged set on a copy; the stored set of `color` is
        // left untouched because other holders may still refer to it.
        let mut grown = existing.clone();
        grown.0.extend(missing.into_iter().cloned());
        let new_color = Colors::compute_color(&grown);

        if new_color != color {
            // One holder is leaving `color`; forget it when nobody is left.
            existing.1 -= 1;
            if existing.1 == 0 {
                self.colors.remove(&color);
            }
        }

        self.bump_or_insert(new_color, grown);
        Ok(new_color)
    }

    /// Registers one more use of `color`: increments the count when the
    /// color is already stored (asserting the stored set matches), or
    /// stores the set of `tracker` with a count of 1 otherwise.
    fn bump_or_insert(&mut self, color: Color, tracker: IdxTracker) {
        let (members, _) = tracker;
        self.colors
            .entry(color)
            .and_modify(|known| {
                assert_eq!(known.0, members);
                known.1 += 1;
            })
            .or_insert_with(|| (members, 1));
    }

    /// Hashes the index set of `idxs` into a color; the count is ignored.
    fn compute_color(idxs: &IdxTracker) -> Color {
        let mut hasher = BuildHasherDefault::<twox_hash::Xxh3Hash128>::default().build_hasher();
        idxs.0.hash(&mut hasher);
        hasher.finish()
    }

    /// Number of colors currently stored.
    pub fn len(&self) -> usize {
        self.colors.len()
    }

    /// `true` when no color is stored.
    pub fn is_empty(&self) -> bool {
        self.colors.is_empty()
    }

    /// Tells whether `color` is stored and its set includes `idx`.
    /// An unknown color yields `false`.
    pub fn contains(&self, color: Color, idx: Idx) -> bool {
        self.colors
            .get(&color)
            .map_or(false, |tracker| tracker.0.contains(&idx))
    }

    /// Iterates over the indices that make up `color`.
    ///
    /// # Panics
    ///
    /// Panics when `color` is not stored in the table.
    pub fn indices(&self, color: &Color) -> Indices {
        // TODO: what if color is not present?
        let tracker = self.colors.get(color).unwrap();
        Indices {
            iter: tracker.0.iter(),
        }
    }

    /// Keeps only the colors for which `f` returns `true`; the predicate is
    /// forwarded to the underlying map's `retain`.
    pub fn retain<F>(&mut self, f: F)
    where
        F: FnMut(&Color, &mut IdxTracker) -> bool,
    {
        self.colors.retain(f)
    }
}

/// Borrowing iterator over the indices of a single color; values of this
/// type are built by `Colors::indices`.
pub struct Indices<'a> {
// Iterator obtained from the color's index set.
iter: vec_collections::VecSetIter<core::slice::Iter<'a, Idx>>,
}

impl<'a> Iterator for Indices<'a> {
type Item = &'a Idx;

// Pure delegation to the wrapped iterator.
fn next(&mut self) -> Option<Self::Item> {
self.iter.next()
}
}

#[cfg(test)]
mod test {
use super::*;

// Re-using a color with an index it already has keeps the same color;
// adding an unseen index produces a second, different color.
#[test]
fn colors_update() {
let mut colors = Colors::new();

let color = colors.update(None, &[1_u64]).unwrap();
assert_eq!(colors.len(), 1);

dbg!("update");
let new_color = colors.update(Some(color), &[1_u64]).unwrap();
assert_eq!(colors.len(), 1);
assert_eq!(color, new_color);

dbg!("upgrade");
let new_color = colors.update(Some(color), &[2_u64]).unwrap();
assert_eq!(colors.len(), 2);
assert_ne!(color, new_color);
}

// Follows the per-color counts through a sequence of updates and checks
// that a color disappears once its count drops to zero.
#[test]
fn colors_retain() {
let mut colors = Colors::new();

let color1 = colors.update(None, &[1_u64]).unwrap();
assert_eq!(colors.len(), 1);
// used_colors:
// color1: 1

dbg!("update");
let same_color = colors.update(Some(color1), &[1_u64]).unwrap();
assert_eq!(colors.len(), 1);
assert_eq!(color1, same_color);
// used_colors:
// color1: 2

dbg!("upgrade");
let color2 = colors.update(Some(color1), &[2_u64]).unwrap();
assert_eq!(colors.len(), 2);
assert_ne!(color1, color2);
// used_colors:
// color1: 1
// color2: 1

dbg!("update");
let same_color = colors.update(Some(color2), &[2_u64]).unwrap();
assert_eq!(colors.len(), 2);
assert_eq!(color2, same_color);
// used_colors:
// color1: 1
// color2: 2

dbg!("upgrade");
let color3 = colors.update(Some(color1), &[3_u64]).unwrap();
assert_ne!(color1, color3);
assert_ne!(color2, color3);
// used_colors:
// color1: 0
// color2: 2
// color3: 1

// Before the per-color count tracker existed, `retain` had to be
// called at this point to drop the colors that were no longer used:
//assert_eq!(colors.len(), 3);
//colors.retain(|c, _| [color2, color3].contains(c));

// With the count tracker, color1 was already removed when its count hit 0.
assert_eq!(colors.len(), 2);
}
}
37 changes: 37 additions & 0 deletions src/core/src/ffi/index/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
pub mod revindex;

use crate::signature::Signature;

use crate::ffi::signature::SourmashSignature;
use crate::ffi::utils::{ForeignObject, SourmashStr};

/// FFI handle type for a single search result; it carries no fields of its
/// own and only names the pointer type used by the `searchresult_*` functions.
pub struct SourmashSearchResult;

impl ForeignObject for SourmashSearchResult {
// Rust value behind the handle. The accessors in this module read element 0
// as the score, element 1 as the signature and element 2 as the filename.
type RustObject = (f64, Signature, String);
}

/// Disposes of a search result by passing `ptr` to `SourmashSearchResult::drop`.
///
/// # Safety
///
/// `ptr` is forwarded without any check.
/// NOTE(review): presumably it must be a pointer previously produced for a
/// `SourmashSearchResult` and not freed yet -- confirm against
/// `ForeignObject::drop`.
#[no_mangle]
pub unsafe extern "C" fn searchresult_free(ptr: *mut SourmashSearchResult) {
SourmashSearchResult::drop(ptr);
}

/// Returns the score of a search result, i.e. the `f64` stored as the first
/// element of the `(f64, Signature, String)` value behind `ptr`.
///
/// # Safety
///
/// `ptr` is handed unchecked to `SourmashSearchResult::as_rust`.
/// NOTE(review): presumably it must point to a live search result --
/// confirm against `ForeignObject::as_rust`.
#[no_mangle]
pub unsafe extern "C" fn searchresult_score(ptr: *const SourmashSearchResult) -> f64 {
    SourmashSearchResult::as_rust(ptr).0
}

/// Returns a copy of the filename of a search result, i.e. the `String`
/// stored as the third element of the value behind `ptr`, converted into a
/// `SourmashStr`.
///
/// # Safety
///
/// `ptr` is handed unchecked to `SourmashSearchResult::as_rust`.
/// NOTE(review): presumably it must point to a live search result --
/// confirm against `ForeignObject::as_rust`.
#[no_mangle]
pub unsafe extern "C" fn searchresult_filename(ptr: *const SourmashSearchResult) -> SourmashStr {
    let filename = SourmashSearchResult::as_rust(ptr).2.clone();
    filename.into()
}

/// Returns a pointer to a clone of the signature of a search result, i.e.
/// the `Signature` stored as the second element of the value behind `ptr`,
/// wrapped with `SourmashSignature::from_rust`.
///
/// NOTE(review): since a clone is wrapped, the caller presumably owns the
/// returned pointer and must release it separately -- confirm against the
/// signature FFI.
///
/// # Safety
///
/// `ptr` is handed unchecked to `SourmashSearchResult::as_rust`.
/// NOTE(review): presumably it must point to a live search result --
/// confirm against `ForeignObject::as_rust`.
#[no_mangle]
pub unsafe extern "C" fn searchresult_signature(
    ptr: *const SourmashSearchResult,
) -> *mut SourmashSignature {
    let signature = SourmashSearchResult::as_rust(ptr).1.clone();
    SourmashSignature::from_rust(signature)
}
Loading

0 comments on commit 13bf0d5

Please sign in to comment.