Skip to content

Commit

Permalink
Add command line options for overlap phase
Browse files Browse the repository at this point in the history
  • Loading branch information
rvaser committed Aug 24, 2021
1 parent 2cdc223 commit 9254fc4
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 54 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.11)

project(raven VERSION 1.5.3
project(raven VERSION 1.6.0
LANGUAGES CXX
DESCRIPTION "Raven is a de novo genome assembler for long uncorrected reads.")

Expand Down
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,15 @@ usage: raven [options ...] <sequences> [<sequences> ...]
input file in FASTA/FASTQ format (can be compressed with gzip)

options:
--weaken
use larger (k, w) when assembling highly accurate sequences
-k, --kmer-len <int>
default: 15
length of minimizers used to find overlaps
-w, --window-len <int>
default: 5
length of sliding window from which minimizers are sampled
-f, --frequency <double>
default: 0.001
threshold for ignoring most frequent minimizers
-p, --polishing-rounds <int>
default: 2
number of times racon is invoked
Expand Down
21 changes: 9 additions & 12 deletions src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,20 +79,22 @@ std::atomic<std::uint32_t> Graph::Node::num_objects{0};
std::atomic<std::uint32_t> Graph::Edge::num_objects{0};

Graph::Graph(
bool weaken,
bool checkpoints,
std::shared_ptr<thread_pool::ThreadPool> thread_pool)
: thread_pool_(thread_pool ?
thread_pool :
std::make_shared<thread_pool::ThreadPool>(1)),
stage_(-5),
checkpoints_(checkpoints),
accurate_(weaken),
piles_(),
nodes_(),
edges_() {}

void Graph::Construct(std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequences) { // NOLINT
void Graph::Construct(
std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequences, // NOLINT
std::uint8_t kmer_len,
std::uint8_t window_len,
double freq) {
if (sequences.empty() || stage_ > -4) {
return;
}
Expand Down Expand Up @@ -273,12 +275,7 @@ void Graph::Construct(std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequen

biosoup::Timer timer{};

std::uint32_t kmer_len = accurate_ ? 29 : 15;
ram::MinimizerEngine minimizer_engine{
thread_pool_,
kmer_len,
accurate_ ? 9U : 5U
};
ram::MinimizerEngine minimizer_engine{thread_pool_, kmer_len, window_len};

if (stage_ == -5) { // find overlaps and create piles
for (const auto& it : sequences) {
Expand All @@ -298,7 +295,7 @@ void Graph::Construct(std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequen
sequences.begin() + j,
sequences.begin() + i + 1,
true);
minimizer_engine.Filter(0.001);
minimizer_engine.Filter(freq);

std::cerr << "[raven::Graph::Construct] minimized "
<< j << " - " << i + 1 << " / " << sequences.size() << " "
Expand Down Expand Up @@ -562,7 +559,7 @@ void Graph::Construct(std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequen
timer.Start();

std::vector<std::future<std::vector<biosoup::Overlap>>> thread_futures;
minimizer_engine.Filter(0.001);
minimizer_engine.Filter(freq);
for (std::uint32_t k = 0; k < i + 1; ++k) {
thread_futures.emplace_back(thread_pool_->Submit(
[&] (std::uint32_t i) -> std::vector<biosoup::Overlap> {
Expand Down Expand Up @@ -1058,7 +1055,7 @@ std::uint32_t Graph::RemoveBubbles() {
static_cast<double>(std::max(l.size(), r.size()));
edlibFreeAlignResult(result);
}
if (score < 0.5) {
if (score < 0.8) {
return std::unordered_set<std::uint32_t>{};
}
}
Expand Down
44 changes: 29 additions & 15 deletions src/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ namespace raven {
class Graph {
public:
Graph(
bool weaken,
bool checkpoints,
std::shared_ptr<thread_pool::ThreadPool> thread_pool = nullptr);

Expand All @@ -59,14 +58,25 @@ class Graph {
return stage_;
}

// break chimeric sequences, remove contained sequences and overlaps not
// spanning bridged repeats at sequence ends
void Construct(std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequences); // NOLINT

// simplify with transitive reduction, tip prunning and bubble popping
// Overlap phase
// - split chimeric sequences
// - remove contained sequences
// - remove overlaps not spanning bridged repeats at sequence ends
void Construct(
std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequences, // NOLINT
std::uint8_t kmer_len = 15,
std::uint8_t window_len = 5,
double freq = 0.001);

// Layout phase
// - remove transitive edges
// - remove tips
// - remove bubbles
// - remove elongated edges in 2D layout
void Assemble();

// Racon wrapper
// Consensus phase
// - utilize Racon
void Polish(
const std::vector<std::unique_ptr<biosoup::NucleicAcid>>& sequences,
std::uint8_t match,
Expand All @@ -77,19 +87,20 @@ class Graph {
std::uint32_t cuda_alignment_batches,
std::uint32_t num_rounds);

// ignore nodes that are less than epsilon away from any junction node
// shrink graph by joining paths without branches into a single node
// - ignore nodes less than epsilon away from a node with outdegree > 1
std::uint32_t CreateUnitigs(std::uint32_t epsilon = 0);

std::vector<std::unique_ptr<biosoup::NucleicAcid>> GetUnitigs(
bool drop_unpolished = false);

// draw with misc/plotter.py
// draw pile-o-grams with misc/plotter.py
void PrintJson(const std::string& path) const;

// draw with Cytoscape
// draw graph with Cytoscape
void PrintCsv(const std::string& path) const;

// draw with Bandage
// draw graph with Bandage
void PrintGfa(const std::string& path) const;

// cereal load wrapper
Expand All @@ -102,14 +113,18 @@ class Graph {
// inspired by (Myers 1995) & (Myers 2005)
std::uint32_t RemoveTransitiveEdges();

// searh from a node with inedegree = 0 until a node with outdegree > 1
std::uint32_t RemoveTips();

// BFS from a node with outdegree > 1 to first joint node
// - check path similarity with edlib
// - remove edges which do not remove reachability of any two nodes
std::uint32_t RemoveBubbles();

// remove long edges in force directed layout
// remove elongated edges in force directed layout
std::uint32_t RemoveLongEdges(std::uint32_t num_round);

// label small circular contigs as unitigs
// label small circular contigs as unitigs if they are unique
std::uint32_t SalvagePlasmids();

friend cereal::access;
Expand Down Expand Up @@ -265,14 +280,13 @@ class Graph {
bool remove_nodes = false);

// use (Fruchterman & Reingold 1991) with (Barnes & Hut 1986) approximation
// (draw with misc/plotter.py)
// - draw with misc/plotter.py
void CreateForceDirectedLayout(const std::string& path = "");

std::shared_ptr<thread_pool::ThreadPool> thread_pool_;

int stage_;
bool checkpoints_;
bool accurate_;
std::vector<std::unique_ptr<Pile>> piles_;
std::vector<std::shared_ptr<Node>> nodes_;
std::vector<std::shared_ptr<Edge>> edges_;
Expand Down
49 changes: 31 additions & 18 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ std::atomic<std::uint32_t> biosoup::NucleicAcid::num_objects{0};
namespace {

static struct option options[] = {
{"weaken", no_argument, nullptr, 'w'},
{"kmer-len", required_argument, nullptr, 'k'},
{"window-len", required_argument, nullptr, 'w'},
{"frequency", required_argument, nullptr, 'f'},
{"polishing-rounds", required_argument, nullptr, 'p'},
{"match", required_argument, nullptr, 'm'},
{"mismatch", required_argument, nullptr, 'n'},
Expand All @@ -25,7 +27,7 @@ static struct option options[] = {
{"cuda-banded-alignment", no_argument, nullptr, 'b'},
{"cuda-alignment-batches", required_argument, nullptr, 'a'},
#endif
{"graphical-fragment-assembly", required_argument, nullptr, 'f'},
{"graphical-fragment-assembly", required_argument, nullptr, 'F'},
{"resume", no_argument, nullptr, 'r'},
{"disable-checkpoints", no_argument, nullptr, 'd'},
{"threads", required_argument, nullptr, 't'},
Expand Down Expand Up @@ -76,8 +78,15 @@ void Help() {
" input file in FASTA/FASTQ format (can be compressed with gzip)\n"
"\n"
" options:\n"
" --weaken\n"
" use larger (k, w) when assembling highly accurate sequences\n"
" -k, --kmer-len <int>\n"
" default: 15\n"
" length of minimizers used to find overlaps\n"
" -w, --window-len <int>\n"
" default: 5\n"
" length of sliding window from which minimizers are sampled\n"
" -f, --frequency <double>\n"
" default: 0.001\n"
" threshold for ignoring most frequent minimizers\n"
" -p, --polishing-rounds <int>\n"
" default: 2\n"
" number of times racon is invoked\n"
Expand Down Expand Up @@ -119,7 +128,9 @@ void Help() {
} // namespace

int main(int argc, char** argv) {
bool weaken = false;
std::uint8_t kmer_len = 15;
std::uint8_t window_len = 5;
double freq = 0.001;

std::int32_t num_polishing_rounds = 2;
std::int8_t m = 3;
Expand All @@ -136,41 +147,43 @@ int main(int argc, char** argv) {
std::uint32_t cuda_alignment_batches = 0;
bool cuda_banded_alignment = false;

std::string optstr = "p:m:n:g:t:h";
std::string optstr = "k:w:f:p:m:n:g:t:h";
#ifdef CUDA_ENABLED
optstr += "c:ba:";
#endif
int arg;
while ((arg = getopt_long(argc, argv, optstr.c_str(), options, nullptr)) != -1) { // NOLINT
switch (arg) {
case 'w': weaken = true; break;
case 'p': num_polishing_rounds = atoi(optarg); break;
case 'm': m = atoi(optarg); break;
case 'n': n = atoi(optarg); break;
case 'g': g = atoi(optarg); break;
case 'k': kmer_len = std::atoi(optarg); break;
case 'w': window_len = std::atoi(optarg); break;
case 'f': freq = std::atof(optarg); break;
case 'p': num_polishing_rounds = std::atoi(optarg); break;
case 'm': m = std::atoi(optarg); break;
case 'n': n = std::atoi(optarg); break;
case 'g': g = std::atoi(optarg); break;
#ifdef CUDA_ENABLED
case 'c':
cuda_poa_batches = 1;
// next text entry is not an option, assuming it's the arg for option c
if (optarg == NULL && argv[optind] != NULL && argv[optind][0] != '-') {
cuda_poa_batches = atoi(argv[optind++]);
cuda_poa_batches = std::atoi(argv[optind++]);
}
// optional argument provided in the ususal way
if (optarg != NULL) {
cuda_poa_batches = atoi(optarg);
cuda_poa_batches = std::atoi(optarg);
}
break;
case 'b':
cuda_banded_alignment = true;
break;
case 'a':
cuda_alignment_batches = atoi(optarg);
cuda_alignment_batches = std::atoi(optarg);
break;
#endif
case 'f': gfa_path = optarg; break;
case 'F': gfa_path = optarg; break;
case 'r': resume = true; break;
case 'd': checkpoints = false; break;
case 't': num_threads = atoi(optarg); break;
case 't': num_threads = std::atoi(optarg); break;
case 'v': std::cout << VERSION << std::endl; return 0;
case 'h': Help(); return 0;
default: return 1;
Expand All @@ -192,7 +205,7 @@ int main(int argc, char** argv) {

auto thread_pool = std::make_shared<thread_pool::ThreadPool>(num_threads);

raven::Graph graph{weaken, checkpoints, thread_pool};
raven::Graph graph{checkpoints, thread_pool};
if (resume) {
try {
graph.Load();
Expand Down Expand Up @@ -253,7 +266,7 @@ int main(int argc, char** argv) {
timer.Start();
}

graph.Construct(sequences);
graph.Construct(sequences, kmer_len, window_len, freq);
graph.Assemble();
graph.Polish(sequences, m, n, g, cuda_poa_batches, cuda_banded_alignment,
cuda_alignment_batches, num_polishing_rounds);
Expand Down
12 changes: 6 additions & 6 deletions test/raven_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class RavenTest: public ::testing::Test {
};

TEST_F(RavenTest, Assemble) {
Graph g{false, false, nullptr};
Graph g{false, nullptr};
g.Construct(s);
g.Assemble();
g.Polish(s, 3, -5, -4, 0, false, 0, 2);
Expand All @@ -53,26 +53,26 @@ TEST_F(RavenTest, Assemble) {
}

TEST_F(RavenTest, Checkpoints) {
Graph g{false, false, nullptr};
Graph g{false, nullptr};
g.Construct(s);
g.Assemble();
g.Polish(s, 2, -5, -2, 0, false, 0, 2);
auto u = std::move(g.GetUnitigs(true).front());

SetUp();

g = {false, true, nullptr};
g = {true, nullptr};
g.Construct(s);

g = {false, true, nullptr};
g = {true, nullptr};
g.Load();
g.Assemble();

g = {false, true, nullptr};
g = {true, nullptr};
g.Load();
g.Polish(s, 2, -5, -2, 0, false, 0, 1);

g = {false, true, nullptr};
g = {true, nullptr};
g.Load();
g.Polish(s, 2, -5, -2, 0, false, 0, 2);

Expand Down

0 comments on commit 9254fc4

Please sign in to comment.