Skip to content

Commit

Permalink
Merge pull request #3103 from vgteam/parallel-gbwt-merging
Browse files Browse the repository at this point in the history
Parallel gbwt merging
  • Loading branch information
jltsiren authored Dec 4, 2020
2 parents 113cd91 + d737267 commit 0c49444
Show file tree
Hide file tree
Showing 8 changed files with 342 additions and 196 deletions.
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,6 @@ clean: clean-rocksdb clean-vcflib
$(RM) -r share/
cd $(DEP_DIR) && cd sonLib && $(MAKE) clean
cd $(DEP_DIR) && cd sparsehash && $(MAKE) clean
cd $(DEP_DIR) && cd htslib && $(MAKE) clean
cd $(DEP_DIR) && cd fastahack && $(MAKE) clean
cd $(DEP_DIR) && cd gcsa2 && $(MAKE) clean
cd $(DEP_DIR) && cd gbwt && $(MAKE) clean
Expand Down
2 changes: 1 addition & 1 deletion deps/gbwt
97 changes: 97 additions & 0 deletions src/gbwt_helper.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include "gbwt_helper.hpp"
#include "utility.hpp"

#include <vg/io/vpkg.hpp>

#include <sstream>

namespace vg {
Expand Down Expand Up @@ -105,6 +107,101 @@ void finish_gbwt_constuction(gbwt::GBWTBuilder& builder,

//------------------------------------------------------------------------------

/// Read a compressed GBWT from `filename` into `index`.
///
/// When `show_progress` is set, a progress line is written to stderr before
/// loading. If the loader returns a null pointer, an error message is written
/// to stderr and the process exits with EXIT_FAILURE.
void load_gbwt(const std::string& filename, gbwt::GBWT& index, bool show_progress) {
    if (show_progress) {
        std::cerr << "Loading compressed GBWT from " << filename << std::endl;
    }

    std::unique_ptr<gbwt::GBWT> result = vg::io::VPKG::load_one<gbwt::GBWT>(filename);
    if (!result) {
        // Nothing usable came back from the loader; this is treated as fatal.
        std::cerr << "error: [load_gbwt()] could not load compressed GBWT " << filename << std::endl;
        std::exit(EXIT_FAILURE);
    }

    // Hand the loaded contents over to the caller's object.
    index = std::move(*result);
}

/// Read a dynamic GBWT from `filename` into `index`.
///
/// When `show_progress` is set, a progress line is written to stderr before
/// loading. If the loader returns a null pointer, an error message is written
/// to stderr and the process exits with EXIT_FAILURE.
void load_gbwt(const std::string& filename, gbwt::DynamicGBWT& index, bool show_progress) {
    if (show_progress) {
        std::cerr << "Loading dynamic GBWT from " << filename << std::endl;
    }

    std::unique_ptr<gbwt::DynamicGBWT> result = vg::io::VPKG::load_one<gbwt::DynamicGBWT>(filename);
    if (!result) {
        // Nothing usable came back from the loader; this is treated as fatal.
        std::cerr << "error: [load_gbwt()] could not load dynamic GBWT " << filename << std::endl;
        std::exit(EXIT_FAILURE);
    }

    // Hand the loaded contents over to the caller's object.
    index = std::move(*result);
}

/// Make the compressed GBWT the active index.
///
/// - Already compressed: nothing happens.
/// - Dynamic index active: it is converted into a compressed index and the
///   dynamic index is reset to a default-constructed one.
/// - Otherwise: the compressed index is loaded from `filename` via load_gbwt(),
///   which exits the process if loading fails.
void GBWTHandler::use_compressed() {
    switch (this->in_use) {
    case index_compressed:
        return;

    case index_dynamic:
        if (this->show_progress) {
            std::cerr << "Converting dynamic GBWT into compressed GBWT" << std::endl;
        }
        this->compressed = gbwt::GBWT(this->dynamic);
        // Drop the source representation once the conversion is done.
        this->dynamic = gbwt::DynamicGBWT();
        break;

    default:
        load_gbwt(this->filename, this->compressed, this->show_progress);
        break;
    }

    this->in_use = index_compressed;
}

/// Make the dynamic GBWT the active index.
///
/// - Already dynamic: nothing happens.
/// - Compressed index active: it is converted into a dynamic index and the
///   compressed index is reset to a default-constructed one.
/// - Otherwise: the dynamic index is loaded from `filename` via load_gbwt(),
///   which exits the process if loading fails.
void GBWTHandler::use_dynamic() {
    switch (this->in_use) {
    case index_dynamic:
        return;

    case index_compressed:
        if (this->show_progress) {
            std::cerr << "Converting compressed GBWT into dynamic GBWT" << std::endl;
        }
        this->dynamic = gbwt::DynamicGBWT(this->compressed);
        // Drop the source representation once the conversion is done.
        this->compressed = gbwt::GBWT();
        break;

    default:
        load_gbwt(this->filename, this->dynamic, this->show_progress);
        break;
    }

    this->in_use = index_dynamic;
}

/// Adopt `new_index` as the active compressed GBWT.
///
/// Both stored indexes are reset first, then the contents are exchanged with
/// `new_index`, so the argument ends up holding the freshly reset index.
/// NOTE(review): `filename` is left unchanged here; callers may need
/// unbacked() if the new index does not correspond to that file -- confirm.
void GBWTHandler::use(gbwt::GBWT& new_index) {
    clear();
    compressed.swap(new_index);
    in_use = index_compressed;
}

/// Adopt `new_index` as the active dynamic GBWT.
///
/// Both stored indexes are reset first, then the contents are exchanged with
/// `new_index`, so the argument ends up holding the freshly reset index.
/// NOTE(review): `filename` is left unchanged here; callers may need
/// unbacked() if the new index does not correspond to that file -- confirm.
void GBWTHandler::use(gbwt::DynamicGBWT& new_index) {
    clear();
    dynamic.swap(new_index);
    in_use = index_dynamic;
}

/// Forget the backing file by making `filename` empty.
/// The in-memory indexes and `in_use` are not touched.
void GBWTHandler::unbacked() {
    this->filename.clear();
}

/// Write the active index to `new_filename` and adopt that file as the
/// backing file.
///
/// If no index is in use, a warning is written to stderr and nothing else
/// happens: no file is written and `filename` keeps its previous value, so the
/// handler never claims to be backed by a file it did not produce.
///
/// @param new_filename  Path of the file to write.
void GBWTHandler::serialize(const std::string& new_filename) {
    // Check first, so we neither announce a serialization that will not happen
    // nor point the handler at a file that was never written.
    if (this->in_use == index_none) {
        std::cerr << "warning: [GBWTHandler] no GBWT to serialize" << std::endl;
        return;
    }

    if (this->show_progress) {
        std::cerr << "Serializing the GBWT to " << new_filename << std::endl;
    }

    if (this->in_use == index_compressed) {
        vg::io::VPKG::save(this->compressed, new_filename);
    } else {
        vg::io::VPKG::save(this->dynamic, new_filename);
    }

    // Only now is the in-memory index actually backed by the new file.
    this->filename = new_filename;
}

void GBWTHandler::clear() {
this->compressed = gbwt::GBWT();
this->dynamic = gbwt::DynamicGBWT();
this->in_use = index_none;
}

//------------------------------------------------------------------------------

std::string insert_gbwt_path(MutablePathHandleGraph& graph, const gbwt::GBWT& gbwt_index, gbwt::size_type id) {

gbwt::size_type sequence_id = gbwt::Path::encode(id, false);
Expand Down
54 changes: 54 additions & 0 deletions src/gbwt_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,60 @@ void finish_gbwt_constuction(gbwt::GBWTBuilder& builder,

//------------------------------------------------------------------------------

/// Load a compressed GBWT from the file `filename` into `index`.
/// If `show_progress` is true, a progress message is printed to stderr.
/// If the index cannot be loaded, an error is printed to stderr and the
/// program exits with EXIT_FAILURE.
void load_gbwt(const std::string& filename, gbwt::GBWT& index, bool show_progress = false);

/// Load a dynamic GBWT from the file `filename` into `index`.
/// If `show_progress` is true, a progress message is printed to stderr.
/// If the index cannot be loaded, an error is printed to stderr and the
/// program exits with EXIT_FAILURE.
void load_gbwt(const std::string& filename, gbwt::DynamicGBWT& index, bool show_progress = false);

/**
* Helper class that stores either a GBWT or a DynamicGBWT and loads them from a file
* or converts between them when necessary.
*
* At most one of the two indexes is meaningful at a time; `in_use` says which.
* Switching representation with use_compressed() / use_dynamic() resets the
* index that is no longer in use to a default-constructed object.
*/
struct GBWTHandler {
/// Which of the stored indexes, if any, currently holds the data.
enum index_type { index_none, index_compressed, index_dynamic };

/// Compressed GBWT. Meaningful when `in_use == index_compressed`.
gbwt::GBWT compressed;

/// Dynamic GBWT. Meaningful when `in_use == index_dynamic`.
gbwt::DynamicGBWT dynamic;

/// Which index is in use. Starts as `index_none`.
index_type in_use = index_none;

/// The in-memory indexes are backed by this file. use_compressed() and
/// use_dynamic() load from it when no index is in use.
std::string filename;

/// Print progress information to stderr when loading/converting indexes.
bool show_progress = false;

/// Switch to a compressed GBWT, converting it from the dynamic GBWT or reading it
/// from a file if necessary. Does nothing if the compressed GBWT is already in use.
/// A failed load from `filename` terminates the program (see load_gbwt()).
void use_compressed();

/// Switch to a dynamic GBWT, converting it from the compressed GBWT or reading it
/// from a file if necessary. Does nothing if the dynamic GBWT is already in use.
/// A failed load from `filename` terminates the program (see load_gbwt()).
void use_dynamic();

/// Start using this compressed GBWT. Clears the index used as the argument.
/// Any index previously held by the handler is cleared first; `filename` is
/// not changed.
void use(gbwt::GBWT& new_index);

/// Start using this dynamic GBWT. Clears the index used as the argument.
/// Any index previously held by the handler is cleared first; `filename` is
/// not changed.
void use(gbwt::DynamicGBWT& new_index);

/// The GBWT is no longer backed by a file. Sets `filename` to an empty string.
void unbacked();

/// Serialize the in-memory index to this file and start using it as the backing file.
/// If no index is in use, prints a warning to stderr and writes nothing.
void serialize(const std::string& new_filename);

/// Clear the in-memory index. Resets both indexes and sets `in_use` to
/// `index_none`; `filename` is not changed.
void clear();
};

//------------------------------------------------------------------------------

/// Insert a GBWT thread into the graph and return its name. Returns an empty string on failure.
/// NOTE: id is a gbwt path id, not a gbwt sequence id.
std::string insert_gbwt_path(MutablePathHandleGraph& graph, const gbwt::GBWT& gbwt_index, gbwt::size_type id);
Expand Down
28 changes: 4 additions & 24 deletions src/haplotype_indexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,12 +336,7 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const std::vecto
return (this->excluded_samples.find(sample_names[sample_id]) == this->excluded_samples.end());
}, [&](const gbwt::Haplotype& haplotype) {
builder.insert(haplotype.path, true); // Insert in both orientations.
builder.index.metadata.addPath({
static_cast<gbwt::PathName::path_name_type>(haplotype.sample),
static_cast<gbwt::PathName::path_name_type>(contig_names.size() - 1),
static_cast<gbwt::PathName::path_name_type>(haplotype.phase),
static_cast<gbwt::PathName::path_name_type>(haplotype.count)
});
builder.index.metadata.addPath(haplotype.sample, contig_names.size() - 1, haplotype.phase, haplotype.count);
haplotypes.insert(gbwt::range_type(haplotype.sample, haplotype.phase));
}, [&](gbwt::size_type, gbwt::size_type) -> bool {
// For each overlap, discard it if our global flag is set.
Expand Down Expand Up @@ -399,21 +394,11 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
}
builder.insert(buffer, true); // Insert in both orientations.
if (this->paths_as_samples) {
builder.index.metadata.addPath({
static_cast<gbwt::PathName::path_name_type>(sample_names.size()),
static_cast<gbwt::PathName::path_name_type>(0),
static_cast<gbwt::PathName::path_name_type>(0),
static_cast<gbwt::PathName::path_name_type>(0)
});
builder.index.metadata.addPath(sample_names.size(), 0, 0, 0);
sample_names.push_back(path_name);
haplotype_count++;
} else {
builder.index.metadata.addPath({
static_cast<gbwt::PathName::path_name_type>(0),
static_cast<gbwt::PathName::path_name_type>(contig_names.size()),
static_cast<gbwt::PathName::path_name_type>(0),
static_cast<gbwt::PathName::path_name_type>(0)
});
builder.index.metadata.addPath(0, contig_names.size(), 0, 0);
contig_names.push_back(path_name);
}
});
Expand Down Expand Up @@ -463,12 +448,7 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
sample_count = iter->second.second;
iter->second.second++;
}
builder.index.metadata.addPath({
static_cast<gbwt::PathName::path_name_type>(sample_id),
static_cast<gbwt::PathName::path_name_type>(0),
static_cast<gbwt::PathName::path_name_type>(0),
static_cast<gbwt::PathName::path_name_type>(sample_count)
});
builder.index.metadata.addPath(sample_id, 0, 0, sample_count);
};
for (auto& file_name : aln_filenames) {
if (aln_format == "GAM") {
Expand Down
Loading

2 comments on commit 0c49444

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 15009 seconds

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for branch v1.29.0. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 14878 seconds

Please sign in to comment.