Merge pull request #6 from santeripuranen/compress_output
Compress output using gzip
johnlees authored Feb 15, 2020
2 parents faadc2a + f7bcad0 commit 2a0d979
Showing 4 changed files with 94 additions and 28 deletions.
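The change in a nutshell: each plain ofstream used for the output files is replaced by a boost::iostreams::filtering_ostream, so that a gzip compressor can optionally be pushed in front of the file sink. A minimal standalone sketch of that pattern, with an illustrative file name and payload (not taken from the commit):

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/device/file.hpp>
#include <iostream>

namespace io = boost::iostreams;

int main() {
    io::filtering_ostream out;
    out.push(io::gzip_compressor());                              // compress everything written below
    out.push(io::file_sink("example.txt.gz", std::ios::binary));  // terminal device ends the chain
    if (!out.is_complete()) {                                     // sink failed to open
        std::cerr << "could not open output file" << std::endl;
        return 1;
    }
    out << "unitig_1 sample_A sample_B" << std::endl;             // ordinary ostream usage
    return 0;                                                     // destructor flushes and closes
}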
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -68,7 +68,7 @@ include (GatbCore)
# find Boost
set(BOOST_ROOT /usr/bin/include)
set(Boost_USE_STATIC_LIBS ON)
FIND_PACKAGE(Boost 1.69.0 COMPONENTS program_options filesystem system regex REQUIRED)
FIND_PACKAGE(Boost 1.69.0 COMPONENTS program_options filesystem system regex iostreams REQUIRED)
include_directories(${Boost_INCLUDE_DIR})

# cdbg-ops target
@@ -102,7 +102,7 @@ set (PROGRAM_SOURCE_DIR ${PROJECT_SOURCE_DIR}/src)
include_directories (${PROGRAM_SOURCE_DIR})
file (GLOB_RECURSE ProjectFiles ${PROGRAM_SOURCE_DIR}/*.cpp)
add_executable(${PROGRAM} ${ProjectFiles})
target_link_libraries(${PROGRAM} ${gatb-core-libraries} ${Boost_LIBRARIES} -static-libgcc -static-libstdc++)
target_link_libraries(${PROGRAM} ${gatb-core-libraries} ${Boost_LIBRARIES} -lz -static-libgcc -static-libstdc++)

################################################################################
# INSTALLATION
2 changes: 2 additions & 0 deletions src/global.cpp
@@ -36,6 +36,7 @@ const char* STR_STRAINS_FILE = "-strains";
const char* STR_KSKMER_SIZE = "-k";
const char* STR_OUTPUT = "-output";
const char* STR_NBCORES = "-nb-cores";
const char* STR_GZIP = "-gzip";

//global vars used by both programs
Graph *graph;
@@ -47,4 +48,5 @@ void populateParser (Tool *tool) {
tool->getParser()->push_front (new OptionOneParam (STR_OUTPUT, "Path to the folder where the final and temporary files will be stored.", false, "output"));
tool->getParser()->push_front (new OptionOneParam (STR_KSKMER_SIZE, "K-mer size.", false, "31"));
tool->getParser()->push_front (new OptionOneParam (STR_STRAINS_FILE, "A text file describing the strains containing 2 columns: 1) ID of the strain; 2) Path to a multi-fasta file containing the sequences of the strain. This file needs a header.", true));
tool->getParser()->push_front (new OptionNoParam (STR_GZIP, "Compress unitig output using gzip.", false));
}
1 change: 1 addition & 0 deletions src/global.h
@@ -40,6 +40,7 @@ extern const char* STR_STRAINS_FILE;
extern const char* STR_KSKMER_SIZE;
extern const char* STR_OUTPUT;
extern const char* STR_NBCORES;
extern const char* STR_GZIP;

void populateParser (Tool *tool);

115 changes: 89 additions & 26 deletions src/map_reads.cpp
@@ -31,18 +31,24 @@
#include "map_reads.hpp"
#include "Utils.h"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/device/file.hpp>

#include <map>
#define NB_OF_READS_NOTIFICATION_MAP_AND_PHASE 10 //Nb of reads that the map and phase must process for notification
using namespace std;

namespace io = boost::iostreams;

void mapReadToTheGraphCore(const string &read, const Graph &graph, const vector< UnitigIdStrandPos > &nodeIdToUnitigId,
map<int,int> &unitigIdToCount) {
int lastUnitig=-1;

//goes through all nodes/kmers of the read
if (read.size() >= graph.getKmerSize()) {
for (int i = 0; i < read.size() - graph.getKmerSize() + 1; i++) {
for (std::size_t i = 0; i < read.size() - graph.getKmerSize() + 1; i++) {
string LRKmer = string(read.c_str() + i, graph.getKmerSize());

//TODO: From my tests, if you use buildNode with a Kmer containing an 'N', it will build a node with all Ns replaced by G
@@ -76,13 +82,6 @@ void mapReadToTheGraphCore(const string &read, const Graph &graph, const vector<
}
}

void mapReadToTheGraph(const string &read, int readfileIndex, unsigned long readIndex, const Graph &graph,
const vector< UnitigIdStrandPos > &nodeIdToUnitigId, map<int,int> &unitigIdToCount) {
//map the read
mapReadToTheGraphCore(read, graph, nodeIdToUnitigId, unitigIdToCount);
}


// We define a functor that will be cloned by the dispatcher
struct MapAndPhase
{
@@ -140,7 +139,7 @@
for (it.first(); !it.isDone(); it.next()) {
string read = (it.item()).toString();
//transform the read to upper case
for (int j=0;j<read.size();j++)
for (std::size_t j=0;j<read.size();j++)
read[j]=toupper(read[j]);

//map this read to the graph
@@ -166,6 +165,34 @@ map_reads::map_reads () : Tool ("map_reads") //give a name to our tool
populateParser(this);
}

std::vector< boost::dynamic_bitset<> >
transposeXU( std::vector< boost::dynamic_bitset<> > &XUT )
{
using bitmap_t = boost::dynamic_bitset<>;
using bitmap_container_t = std::vector< bitmap_t >;

const std::size_t nsamples( XUT.size() );
const std::size_t ncontigs = XUT[0].size();
bitmap_container_t XU; XU.resize(ncontigs);
for( auto& row: XU ) { row.resize(nsamples); }

// transpose the input matrix; this is a fairly expensive operation
// due to the horrible memory access patterns involved here
for( std::size_t i=0; i<XUT.size(); ++i )
{
auto& bitset = XUT[i];
auto pos = bitset.find_first();
while( pos != bitmap_t::npos )
{
XU[pos].set(i);
pos = bitset.find_next(pos);
}
// release memory
bitset.resize(0); bitmap_t(bitset).swap(bitset);
}
return XU;
}
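
For readers unfamiliar with boost::dynamic_bitset, the transpose above visits only the set bits of each row via find_first()/find_next() instead of testing every cell, which is what keeps sparse presence patterns cheap to walk. A tiny self-contained illustration of that traversal (values hypothetical):

#include <boost/dynamic_bitset.hpp>
#include <iostream>

int main() {
    boost::dynamic_bitset<> row(8);                         // one row with 8 columns, all zero
    row.set(1); row.set(5);                                 // sparse presence pattern
    for (auto pos = row.find_first();
         pos != boost::dynamic_bitset<>::npos;
         pos = row.find_next(pos)) {
        std::cout << "set bit at column " << pos << '\n';   // prints 1, then 5
    }
    return 0;
}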

//pattern is the unitig line
map< vector<int>, vector<int> > getUnitigsWithSamePattern (const vector< vector<int> > &XU, int nbContigs) {
map< vector<int>, vector<int> > pattern2Unitigs;
@@ -187,9 +214,28 @@ map< vector<int>, vector<int> > getUnitigsWithSamePattern (const vector< vector<
return pattern2Unitigs;
}

void generate_XU(const string &filename, const string &nodesFile, const vector< vector<int> > &XU) {
ofstream XUFile;
openFileForWriting(filename, XUFile);
bool init_sink( const std::string& filename, io::filtering_ostream& os, bool compress=false )
{
if( compress ) {
os.push( io::gzip_compressor() );
os.push( io::file_sink(filename+".gz",std::ios::binary) );
}
else {
os.push( io::file_sink(filename) );
}

return os.is_complete();
}
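
Push order matters in init_sink: Boost.Iostreams applies filters in the order pushed, so the gzip_compressor must precede the terminal file_sink, and is_complete() reports whether the chain has been capped with a device. A hypothetical call site, not taken from the commit:

io::filtering_ostream os;
if (init_sink("unitigs.txt", os, /*compress=*/true)) {   // actually writes unitigs.txt.gz
    os << "unitig_1 sample_A" << std::endl;              // compressed transparently
}                                                        // destructor flushes and closes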

void generate_XU(const string &filename, const string &nodesFile, const vector< boost::dynamic_bitset<> > &XU, bool compress=false ) {
using bitmap_t = boost::dynamic_bitset<>;
//ofstream XUFile;
//openFileForWriting(filename, XUFile);
io::filtering_ostream XUFile;
if( !init_sink( filename, XUFile, compress ) ) {
cerr << "Unknown error when trying to open file \"" << filename << "\" for output!" << endl;
return;
}

//open file with list of node sequences
ifstream nodesFileReader;
@@ -210,7 +256,7 @@ void generate_XU(const string &filename, const string &nodesFile, const vector<
XUFile << endl;
}
nodesFileReader.close();
XUFile.close();
//XUFile.close(); // filtering_ostream's destructor does this for us
}

void generate_unique_id_to_original_ids(const string &filename,
@@ -234,10 +280,15 @@
uniqueIdToOriginalIdsFile.close();
}

void generate_XU_unique(const string &filename, const vector< vector<int> > &XU,
const map< vector<int>, vector<int> > &pattern2Unitigs){
ofstream XUUnique;
openFileForWriting(filename, XUUnique);
void generate_XU_unique(const string &filename, const vector< boost::dynamic_bitset<> > &XU,
const map< boost::dynamic_bitset<>, vector<int> > &pattern2Unitigs, bool compress=false ){
//ofstream XUUnique;
//openFileForWriting(filename, XUUnique);
io::filtering_ostream XUUnique;
if( !init_sink( filename, XUUnique, compress ) ) {
cerr << "Unknown error when trying to open file \"" << filename << "\" for output!" << endl;
return;
}

//print the header
XUUnique << "pattern_id";
Expand All @@ -257,13 +308,14 @@ void generate_XU_unique(const string &filename, const vector< vector<int> > &XU,
XUUnique << " " << v;
XUUnique << endl;
}
XUUnique.close();
//XUUnique.close(); // filtering_ostream's destructor does this for us
}

//generate the pyseer input
void generatePyseerInput (const vector <string> &allReadFilesNames,
const string &outputFolder, const string &tmpFolder,
int nbContigs) {
const string &outputFolder,
const vector< boost::dynamic_bitset<> >& XU,
int nbContigs, bool compress=false ) {
//Generate the XU (the pyseer input - the unitigs are rows with strains present)
//XU_unique is XU is in matrix form (for Rtab input) with the duplicated rows removed
cerr << endl << endl << "[Generating pyseer input]..." << endl;
@@ -284,10 +336,11 @@

//create the files for pyseer
{
generate_XU(outputFolder+string("/unitigs.txt"), outputFolder+string("/graph.nodes"), XU);
map< vector<int>, vector<int> > pattern2Unitigs = getUnitigsWithSamePattern(XU, nbContigs);
generate_XU(outputFolder+string("/unitigs.txt"), outputFolder+string("/graph.nodes"), XU, compress );
auto pattern2Unitigs = getUnitigsWithSamePattern(XU);
cout << "Number of unique patterns: " << pattern2Unitigs.size() << endl;
generate_unique_id_to_original_ids(outputFolder+string("/unitigs.unique_rows_to_all_rows.txt"), pattern2Unitigs);
generate_XU_unique(outputFolder+string("/unitigs.unique_rows.Rtab"), XU, pattern2Unitigs);
generate_XU_unique(outputFolder+string("/unitigs.unique_rows.Rtab"), XU, pattern2Unitigs, compress );
}

cerr << "[Generating pyseer input] - Done!" << endl;
Expand All @@ -302,6 +355,7 @@ void map_reads::execute ()
string tmpFolder = outputFolder+string("/tmp");
string longReadsFile = tmpFolder+string("/readsFile");
int nbCores = getInput()->getInt(STR_NBCORES);
const bool compress = getInput()->get(STR_GZIP);

//get the nbContigs
int nbContigs = getNbLinesInFile(outputFolder+string("/graph.nodes"));
@@ -330,12 +384,21 @@
MapAndPhase(allReadFilesNames, *graph, outputFolder, tmpFolder, nbOfReadsProcessed, synchro,
*nodeIdToUnitigId, nbContigs));

cerr << endl << "[Mapping process finished!]" << endl;
// allUnitigPatterns has all samples/strains over the first dimension and
// unitig presence patterns over the second dimension (in bitsets).
// Here we transpose the matrix, while consuming it in order to gradually reduce memory footprint.
// For larger data sets this pattern accounting will dominate our memory footprint; overall memory consumption will peak here.
// Peak memory use occurs at the start and will be twice the matrix size (= 2 * (nbContigs*strains->size()/8) bytes).
cout << "[Transpose pattern matrix..]" << endl;
auto XU = transposeXU( allUnitigPatterns ); // this will consume allUnitigPatterns while transposing
allUnitigPatterns.resize(0); bitmap_container_t(allUnitigPatterns).swap(allUnitigPatterns); // release memory

//generate the pyseer input
generatePyseerInput(allReadFilesNames, outputFolder, tmpFolder, nbContigs);
cout << "[Generating pyseer input]..." << endl;
generatePyseerInput(allReadFilesNames, outputFolder, XU, nbContigs, compress);
cout << "[Generating pyseer input] - Done!" << endl;

cout << "Number of unique patterns: " << getNbLinesInFile(outputFolder+string("/unitigs.unique_rows.Rtab")) << endl;
//cout << "Number of unique patterns: " << getNbLinesInFile(outputFolder+string("/unitigs.unique_rows.Rtab")) << endl;

// Remove the global graph pointer, otherwise its destructor is called
// twice by GATB (after main) giving a HDF5 error
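Downstream tooling can read the compressed files back through the symmetric input chain. A minimal sketch, assuming a file produced with the new -gzip flag (file name illustrative):

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/device/file.hpp>
#include <iostream>
#include <string>

namespace io = boost::iostreams;

int main() {
    io::filtering_istream in;
    in.push(io::gzip_decompressor());                             // decompress on the fly
    in.push(io::file_source("unitigs.txt.gz", std::ios::binary));
    std::string line;
    while (std::getline(in, line))                                // lines come out as plain text
        std::cout << line << '\n';
    return 0;
}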
