Skip to content

Commit

Permalink
Merge pull request #9 from will-rowe/dev
Browse files Browse the repository at this point in the history
Merging dev - adding containment search
  • Loading branch information
Will Rowe authored Oct 11, 2018
2 parents 862683e + f5e5b60 commit 0805bd6
Show file tree
Hide file tree
Showing 33 changed files with 891 additions and 405 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

Since version 0.4, `GROOT` will also output the variation graphs which had reads align. These graphs are in [GFA format](https://github.com/GFA-spec/GFA-spec), allowing you to visualise graph alignments using [Bandage](https://github.com/rrwick/Bandage) and determine which variants of a given ARG type are dominant in your metagenomes. Read the [documentation](http://groot-documentation.readthedocs.io/en/latest/?badge=latest) for more info.

Since version 0.8.0, `GROOT` can now optionally use an [LSH Ensemble](https://ekzhu.github.io/datasketch/lshensemble.html) index to enable containment searching. This is thanks to the excellent [method](http://www.vldb.org/pvldb/vol9/p1185-zhu.pdf) and [implementation](https://github.com/ekzhu/lshensemble) of Erkang Zhu. This new index allows reads of varying lengths to be queried against **groot graphs**.

## Installation

Check out the [releases](https://github.com/will-rowe/groot/releases) to download a binary. Alternatively, install using Bioconda or compile the software from source.
Expand Down
24 changes: 17 additions & 7 deletions cmd/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (
"github.com/pkg/profile"
"github.com/spf13/cobra"
"github.com/will-rowe/groot/src/graph"
"github.com/will-rowe/groot/src/lshForest"
"github.com/will-rowe/groot/src/lshIndex"
"github.com/will-rowe/groot/src/misc"
"github.com/will-rowe/groot/src/stream"
"github.com/will-rowe/groot/src/version"
Expand Down Expand Up @@ -187,21 +187,29 @@ func runAlign() {
log.Print("loading index information...")
info := new(misc.IndexInfo)
misc.ErrorCheck(info.Load(*indexDir + "/index.info"))
if info.Containment {
log.Printf("\tindex type: lshEnsemble")
log.Printf("\tcontainment search seeding: enabled")
} else {
log.Printf("\tindex type: lshForest")
log.Printf("\tcontainment search seeding: disabled")
}
log.Printf("\twindow sized used in indexing: %d\n", info.ReadLength)
log.Printf("\tk-mer size: %d\n", info.Ksize)
log.Printf("\tsignature size: %d\n", info.SigSize)
log.Printf("\tJaccard similarity theshold: %0.2f\n", info.JSthresh)
log.Printf("\twindow sized used in indexing: %d\n", info.ReadLength)
log.Print("loading the groot graphs...")
graphStore := make(graph.GraphStore)
misc.ErrorCheck(graphStore.Load(*indexDir + "/index.graph"))
log.Printf("\tnumber of variation graphs: %d\n", len(graphStore))
log.Print("loading the MinHash signatures...")
database := lshForest.NewLSHforest(info.SigSize, info.JSthresh)
var database *lshIndex.LshEnsemble
if info.Containment {
database = lshIndex.NewLSHensemble(make([]lshIndex.Partition, lshIndex.PARTITIONS), info.SigSize, lshIndex.MAXK)
} else {
database = lshIndex.NewLSHforest(info.SigSize, info.JSthresh)
}
misc.ErrorCheck(database.Load(*indexDir + "/index.sigs"))
database.Index()
numHF, numBucks := database.Settings()
log.Printf("\tnumber of hash functions per bucket: %d\n", numHF)
log.Printf("\tnumber of buckets: %d\n", numBucks)
///////////////////////////////////////////////////////////////////////////////////////
// create SAM references from the sequences held in the graphs
referenceMap, err := graphStore.GetRefs()
Expand All @@ -222,10 +230,12 @@ func runAlign() {

// add in the process parameters
dataStream.InputFile = *fastq
fastqChecker.Containment = info.Containment
fastqChecker.WindowSize = info.ReadLength
dbQuerier.Db = database
dbQuerier.CommandInfo = info
dbQuerier.GraphStore = graphStore
dbQuerier.Threshold = info.JSthresh
graphAligner.GraphStore = graphStore
graphAligner.RefMap = referenceMap
graphAligner.MaxClip = *clip
Expand Down
88 changes: 48 additions & 40 deletions cmd/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (
"github.com/spf13/cobra"
"github.com/will-rowe/gfa"
"github.com/will-rowe/groot/src/graph"
"github.com/will-rowe/groot/src/lshForest"
"github.com/will-rowe/groot/src/lshIndex"
"github.com/will-rowe/groot/src/misc"
"github.com/will-rowe/groot/src/seqio"
"github.com/will-rowe/groot/src/version"
Expand All @@ -51,6 +51,7 @@ var (
msaList []string // the collected MSA files
outDir *string // directory to save index files and log to
defaultOutDir = "./groot-index-" + string(time.Now().Format("20060102150405")) // a default dir to store the index files
containment *bool // use lshEnsemble instead of lshForest -- allows for variable read length
)

// the index command (used by cobra)
Expand All @@ -70,10 +71,11 @@ var indexCmd = &cobra.Command{
func init() {
kSize = indexCmd.Flags().IntP("kmerSize", "k", 7, "size of k-mer")
sigSize = indexCmd.Flags().IntP("sigSize", "s", 128, "size of MinHash signature")
readLength = indexCmd.Flags().IntP("readLength", "l", 100, "length of query reads (which will be aligned during the align subcommand)")
jsThresh = indexCmd.Flags().Float64P("jsThresh", "j", 0.99, "minimum Jaccard similarity for a seed to be recorded")
readLength = indexCmd.Flags().IntP("readLength", "l", 100, "max length of query reads (which will be aligned during the align subcommand)")
jsThresh = indexCmd.Flags().Float64P("jsThresh", "j", 0.99, "minimum Jaccard similarity for a seed to be recorded (note: this is used as a containment theshold when --containment set")
msaDir = indexCmd.Flags().StringP("msaDir", "i", "", "directory containing the clustered references (MSA files) - required")
outDir = indexCmd.PersistentFlags().StringP("outDir", "o", defaultOutDir, "directory to save index files to")
containment = indexCmd.Flags().BoolP("containment", "c", false, "use lshEnsemble instead of lshForest (allows for variable read length during alignment)")
indexCmd.MarkFlagRequired("msaDir")
RootCmd.AddCommand(indexCmd)
}
Expand Down Expand Up @@ -136,6 +138,11 @@ func runIndex() {
// check the supplied files and then log some stuff
log.Printf("checking parameters...")
misc.ErrorCheck(indexParamCheck())
if *containment {
log.Printf("\tindexing scheme: lshEnsemble (containment search)")
} else {
log.Printf("\tindexing scheme: lshForest")
}
log.Printf("\tprocessors: %d", *proc)
log.Printf("\tk-mer size: %d", *kSize)
log.Printf("\tsignature size: %d", *sigSize)
Expand Down Expand Up @@ -198,56 +205,57 @@ func runIndex() {
}()
///////////////////////////////////////////////////////////////////////////////////////
// collect and store the GrootGraph windows
var sigStore = make([]map[int]map[int][][]uint64, len(graphStore))
for i := range sigStore {
sigStore[i] = make(map[int]map[int][][]uint64)
}
sigStore := []*lshIndex.GraphWindow{}
lookupMap := make(lshIndex.KeyLookupMap)
// receive the signatures
var sigCount int = 0
for window := range windowChan {
// initialise the inner map of sigStore if graph has not been seen yet
if _, ok := sigStore[window.GraphID][window.Node]; !ok {
sigStore[window.GraphID][window.Node] = make(map[int][][]uint64)
}
// combine graphID, nodeID and offset to form a string key for signature
stringKey := fmt.Sprintf("g%dn%do%d", window.GraphID, window.Node, window.OffSet)
// convert to a graph window
gw := &lshIndex.GraphWindow{stringKey, *readLength, window.Sig}
// store the signature for the graph:node:offset
sigStore[window.GraphID][window.Node][window.OffSet] = append(sigStore[window.GraphID][window.Node][window.OffSet], window.Sig)
sigCount++
sigStore = append(sigStore, gw)
// add a key to the lookup map
lookupMap[stringKey] = seqio.Key{GraphID: window.GraphID, Node: window.Node, OffSet: window.OffSet}
}
log.Printf("\tnumber of signatures generated: %d\n", sigCount)
///////////////////////////////////////////////////////////////////////////////////////
// run LSH forest
log.Printf("running LSH forest...\n")
database := lshForest.NewLSHforest(*sigSize, *jsThresh)
// range over the nodes in each graph, each node will have one or more signature
for graphID, nodesMap := range sigStore {
// add each signature to the database
for nodeID, offsetMap := range nodesMap {
for offset, signatures := range offsetMap {
for _, signature := range signatures {
// combine graphID, nodeID and offset to form a string key for signature
stringKey := fmt.Sprintf("g%dn%do%d", graphID, nodeID, offset)
// add the key to a lookup map
key := seqio.Key{GraphID: graphID, Node: nodeID, OffSet: offset}
database.KeyLookup[stringKey] = key
// add the signature to the lshForest
database.Add(stringKey, signature)
}
}
numSigs := len(sigStore)
log.Printf("\tnumber of signatures generated: %d\n", numSigs)
var database *lshIndex.LshEnsemble
if *containment == false {
///////////////////////////////////////////////////////////////////////////////////////
// run LSH forest
log.Printf("running LSH Forest...\n")
database = lshIndex.NewLSHforest(*sigSize, *jsThresh)
// range over the nodes in each graph, each node will have one or more signature
for window := range lshIndex.Windows2Chan(sigStore) {
// add the signature to the lshForest
database.Add(window.Key, window.Signature, 0)
}
// print some stuff
numHF, numBucks := database.Lshes[0].Settings()
log.Printf("\tnumber of hash functions per bucket: %d\n", numHF)
log.Printf("\tnumber of buckets: %d\n", numBucks)
} else {
///////////////////////////////////////////////////////////////////////////////////////
// run LSH ensemble (https://github.com/ekzhu/lshensemble)
log.Printf("running LSH Ensemble...\n")
database = lshIndex.BootstrapLshEnsemble(lshIndex.PARTITIONS, *sigSize, lshIndex.MAXK, numSigs, lshIndex.Windows2Chan(sigStore))
// print some stuff
log.Printf("\tnumber of LSH Ensemble partitions: %d\n", lshIndex.PARTITIONS)
log.Printf("\tmax no. hash functions per bucket: %d\n", lshIndex.MAXK)
}
numHF, numBucks := database.Settings()
log.Printf("\tnumber of hash functions per bucket: %d\n", numHF)
log.Printf("\tnumber of buckets: %d\n", numBucks)
// attach the key lookup map to the index
database.KeyLookup = lookupMap
///////////////////////////////////////////////////////////////////////////////////////
// record runtime info
info := &misc.IndexInfo{Version: version.VERSION, Ksize: *kSize, SigSize: *sigSize, JSthresh: *jsThresh, ReadLength: *readLength}
info := &misc.IndexInfo{Version: version.VERSION, Ksize: *kSize, SigSize: *sigSize, JSthresh: *jsThresh, ReadLength: *readLength, Containment: *containment}
// save the index files
log.Printf("saving index files to \"%v\"...", *outDir)
misc.ErrorCheck(database.Dump(*outDir + "/index.sigs"))
log.Printf("\tsaved MinHash signatures")
misc.ErrorCheck(info.Dump(*outDir + "/index.info"))
log.Printf("\tsaved runtime info")
misc.ErrorCheck(graphStore.Dump(*outDir + "/index.graph"))
log.Printf("\tsaved groot graphs")
misc.ErrorCheck(database.Dump(*outDir + "/index.sigs"))
log.Printf("\tsaved MinHash signatures")
log.Println("finished")
}
2 changes: 1 addition & 1 deletion cmd/report.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func reportParamCheck() error {
log.Printf("\tBAM file: %v", *bamFile)
}
if *covCutoff > 1.0 {
return fmt.Errorf("supplied coverage cutoff exceeds 1.0 (100%): %v", *covCutoff)
return fmt.Errorf("supplied coverage cutoff exceeds 1.0 (100%%): %.2f", *covCutoff)
}
return nil
}
Expand Down
Empty file modified db/full-ARG-databases/resfinder/aminoglycoside.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/beta-lactam.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/colistin.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/fosfomycin.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/fusidicacid.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/glycopeptide.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/macrolide.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/nitroimidazole.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/oxazolidinone.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/phenicol.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/quinolone.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/rifampicin.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/sulphonamide.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/tetracycline.fsa
100755 → 100644
Empty file.
Empty file modified db/full-ARG-databases/resfinder/trimethoprim.fsa
100755 → 100644
Empty file.
30 changes: 18 additions & 12 deletions src/alignment/alignment_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package alignment

import (
"fmt"
"io"
"log"
"os"
"testing"

Expand All @@ -18,12 +18,12 @@ var (
sigSize = 128
)

func loadGFA() *gfa.GFA {
func loadGFA() (*gfa.GFA, error) {
// load the GFA file
fh, err := os.Open(inputFile)
reader, err := gfa.NewReader(fh)
if err != nil {
log.Fatal("can't read gfa file: %v", err)
return nil, fmt.Errorf("can't read gfa file: %v", err)
}
// collect the GFA instance
myGFA := reader.CollectGFA()
Expand All @@ -34,13 +34,13 @@ func loadGFA() *gfa.GFA {
break
}
if err != nil {
log.Fatal("error reading line in gfa file: %v", err)
return nil, fmt.Errorf("error reading line in gfa file: %v", err)
}
if err := line.Add(myGFA); err != nil {
log.Fatal("error adding line to GFA instance: %v", err)
return nil, fmt.Errorf("error adding line to GFA instance: %v", err)
}
}
return myGFA
return myGFA, nil
}

func setupMultimapRead() (*seqio.FASTQread, error) {
Expand Down Expand Up @@ -80,13 +80,16 @@ func TestExactMatchMultiMapper(t *testing.T) {
// create the read
testRead, err := setupMultimapRead()
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
// create the GrootGraph and graphStore
myGFA := loadGFA()
myGFA, err := loadGFA()
if err != nil {
t.Fatal(err)
}
grootGraph, err := graph.CreateGrootGraph(myGFA, 1)
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
graphStore := make(graph.GraphStore)
graphStore[grootGraph.GraphID] = grootGraph
Expand All @@ -113,13 +116,16 @@ func TestExactMatchUniqMapper(t *testing.T) {
// create the read
testRead, err := setupUniqmapRead()
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
// create the GrootGraph and graphStore
myGFA := loadGFA()
myGFA, err := loadGFA()
if err != nil {
t.Fatal(err)
}
grootGraph, err := graph.CreateGrootGraph(myGFA, 1)
if err != nil {
log.Fatal(err)
t.Fatal(err)
}
graphStore := make(graph.GraphStore)
graphStore[grootGraph.GraphID] = grootGraph
Expand Down
Loading

0 comments on commit 0805bd6

Please sign in to comment.