Merge pull request #6 from santeripuranen/compress_output
Compress output using gzip
johnlees authored Feb 15, 2020
2 parents faadc2a + f7bcad0 commit 2a0d979
Showing 4 changed files with 94 additions and 28 deletions.
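The change in a nutshell: each plain ofstream used for the output files is replaced by a boost::iostreams::filtering_ostream, so that a gzip compressor can optionally be pushed in front of the file sink. A minimal standalone sketch of that pattern, with an illustrative file name and payload (not taken from the commit):

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/device/file.hpp>
#include <iostream>

namespace io = boost::iostreams;

int main() {
    io::filtering_ostream out;
    out.push(io::gzip_compressor());                              // compress everything written below
    out.push(io::file_sink("example.txt.gz", std::ios::binary));  // terminal device ends the chain
    if (!out.is_complete()) {                                     // sink failed to open
        std::cerr << "could not open output file" << std::endl;
        return 1;
    }
    out << "unitig_1 sample_A sample_B" << std::endl;             // ordinary ostream usage
    return 0;                                                     // destructor flushes and closes
}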
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -68,7 +68,7 @@ include (GatbCore)
# find Boost
set(BOOST_ROOT /usr/bin/include)
set(Boost_USE_STATIC_LIBS ON)
FIND_PACKAGE(Boost 1.69.0 COMPONENTS program_options filesystem system regex REQUIRED)
FIND_PACKAGE(Boost 1.69.0 COMPONENTS program_options filesystem system regex iostreams REQUIRED)
include_directories(${Boost_INCLUDE_DIR})

# cdbg-ops target
@@ -102,7 +102,7 @@ set (PROGRAM_SOURCE_DIR ${PROJECT_SOURCE_DIR}/src)
include_directories (${PROGRAM_SOURCE_DIR})
file (GLOB_RECURSE ProjectFiles ${PROGRAM_SOURCE_DIR}/*.cpp)
add_executable(${PROGRAM} ${ProjectFiles})
target_link_libraries(${PROGRAM} ${gatb-core-libraries} ${Boost_LIBRARIES} -static-libgcc -static-libstdc++)
target_link_libraries(${PROGRAM} ${gatb-core-libraries} ${Boost_LIBRARIES} -lz -static-libgcc -static-libstdc++)

################################################################################
# INSTALLATION
2 changes: 2 additions & 0 deletions src/global.cpp
@@ -36,6 +36,7 @@ const char* STR_STRAINS_FILE = "-strains";
const char* STR_KSKMER_SIZE = "-k";
const char* STR_OUTPUT = "-output";
const char* STR_NBCORES = "-nb-cores";
const char* STR_GZIP = "-gzip";

//global vars used by both programs
Graph *graph;
@@ -47,4 +48,5 @@ void populateParser (Tool *tool) {
tool->getParser()->push_front (new OptionOneParam (STR_OUTPUT, "Path to the folder where the final and temporary files will be stored.", false, "output"));
tool->getParser()->push_front (new OptionOneParam (STR_KSKMER_SIZE, "K-mer size.", false, "31"));
tool->getParser()->push_front (new OptionOneParam (STR_STRAINS_FILE, "A text file describing the strains containing 2 columns: 1) ID of the strain; 2) Path to a multi-fasta file containing the sequences of the strain. This file needs a header.", true));
tool->getParser()->push_front (new OptionNoParam (STR_GZIP, "Compress unitig output using gzip.", false));
}
1 change: 1 addition & 0 deletions src/global.h
@@ -40,6 +40,7 @@ extern const char* STR_STRAINS_FILE;
extern const char* STR_KSKMER_SIZE;
extern const char* STR_OUTPUT;
extern const char* STR_NBCORES;
extern const char* STR_GZIP;

void populateParser (Tool *tool);

115 changes: 89 additions & 26 deletions src/map_reads.cpp
@@ -31,18 +31,24 @@
#include "map_reads.hpp"
#include "Utils.h"
#include <boost/algorithm/string/predicate.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/device/file.hpp>

#include <map>
#define NB_OF_READS_NOTIFICATION_MAP_AND_PHASE 10 //Nb of reads that the map and phase must process for notification
using namespace std;

namespace io = boost::iostreams;

void mapReadToTheGraphCore(const string &read, const Graph &graph, const vector< UnitigIdStrandPos > &nodeIdToUnitigId,
map<int,int> &unitigIdToCount) {
int lastUnitig=-1;

//goes through all nodes/kmers of the read
if (read.size() >= graph.getKmerSize()) {
for (int i = 0; i < read.size() - graph.getKmerSize() + 1; i++) {
for (std::size_t i = 0; i < read.size() - graph.getKmerSize() + 1; i++) {
string LRKmer = string(read.c_str() + i, graph.getKmerSize());

//TODO: From my tests, if you use buildNode with a Kmer containing an 'N', it will build a node with all Ns replaced by G
@@ -76,13 +82,6 @@ void mapReadToTheGraphCore(const string &read, const Graph &graph, const vector<
}
}

void mapReadToTheGraph(const string &read, int readfileIndex, unsigned long readIndex, const Graph &graph,
const vector< UnitigIdStrandPos > &nodeIdToUnitigId, map<int,int> &unitigIdToCount) {
//map the read
mapReadToTheGraphCore(read, graph, nodeIdToUnitigId, unitigIdToCount);
}


// We define a functor that will be cloned by the dispatcher
struct MapAndPhase
{
@@ -140,7 +139,7 @@
for (it.first(); !it.isDone(); it.next()) {
string read = (it.item()).toString();
//transform the read to upper case
for (int j=0;j<read.size();j++)
for (std::size_t j=0;j<read.size();j++)
read[j]=toupper(read[j]);

//map this read to the graph
@@ -166,6 +165,34 @@ map_reads::map_reads () : Tool ("map_reads") //give a name to our tool
populateParser(this);
}

std::vector< boost::dynamic_bitset<> >
transposeXU( std::vector< boost::dynamic_bitset<> > &XUT )
{
using bitmap_t = boost::dynamic_bitset<>;
using bitmap_container_t = std::vector< bitmap_t >;

const std::size_t nsamples( XUT.size() );
const std::size_t ncontigs = XUT[0].size();
bitmap_container_t XU; XU.resize(ncontigs);
for( auto& row: XU ) { row.resize(nsamples); }

// transpose the input matrix; this is a fairly expensive operation
// due to the horrible memory access patterns involved here
for( std::size_t i=0; i<XUT.size(); ++i )
{
auto& bitset = XUT[i];
auto pos = bitset.find_first();
while( pos != bitmap_t::npos )
{
XU[pos].set(i);
pos = bitset.find_next(pos);
}
// release memory
bitset.resize(0); bitmap_t(bitset).swap(bitset);
}
return XU;
}
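
For readers unfamiliar with boost::dynamic_bitset, the transpose above visits only the set bits of each row via find_first()/find_next() instead of testing every cell, which is what keeps sparse presence patterns cheap to walk. A tiny self-contained illustration of that traversal (values hypothetical):

#include <boost/dynamic_bitset.hpp>
#include <iostream>

int main() {
    boost::dynamic_bitset<> row(8);                         // one row with 8 columns, all zero
    row.set(1); row.set(5);                                 // sparse presence pattern
    for (auto pos = row.find_first();
         pos != boost::dynamic_bitset<>::npos;
         pos = row.find_next(pos)) {
        std::cout << "set bit at column " << pos << '\n';   // prints 1, then 5
    }
    return 0;
}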

//pattern is the unitig line
map< vector<int>, vector<int> > getUnitigsWithSamePattern (const vector< vector<int> > &XU, int nbContigs) {
map< vector<int>, vector<int> > pattern2Unitigs;
@@ -187,9 +214,28 @@ map< vector<int>, vector<int> > getUnitigsWithSamePattern (const vector< vector<
return pattern2Unitigs;
}

void generate_XU(const string &filename, const string &nodesFile, const vector< vector<int> > &XU) {
ofstream XUFile;
openFileForWriting(filename, XUFile);
bool init_sink( const std::string& filename, io::filtering_ostream& os, bool compress=false )
{
if( compress ) {
os.push( io::gzip_compressor() );
os.push( io::file_sink(filename+".gz",std::ios::binary) );
}
else {
os.push( io::file_sink(filename) );
}

return os.is_complete();
}
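
Push order matters in init_sink: Boost.Iostreams applies filters in the order pushed, so the gzip_compressor must precede the terminal file_sink, and is_complete() reports whether the chain has been capped with a device. A hypothetical call site, not taken from the commit:

io::filtering_ostream os;
if (init_sink("unitigs.txt", os, /*compress=*/true)) {   // actually writes unitigs.txt.gz
    os << "unitig_1 sample_A" << std::endl;              // compressed transparently
}                                                        // destructor flushes and closes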

void generate_XU(const string &filename, const string &nodesFile, const vector< boost::dynamic_bitset<> > &XU, bool compress=false ) {
using bitmap_t = boost::dynamic_bitset<>;
//ofstream XUFile;
//openFileForWriting(filename, XUFile);
io::filtering_ostream XUFile;
if( !init_sink( filename, XUFile, compress ) ) {
cerr << "Unknown error when trying to open file \"" << filename << "\" for output!" << endl;
return;
}

//open file with list of node sequences
ifstream nodesFileReader;
@@ -210,7 +256,7 @@ void generate_XU(const string &filename, const string &nodesFile, const vector<
XUFile << endl;
}
nodesFileReader.close();
XUFile.close();
//XUFile.close(); // filtering_ostream's destructor does this for us
}

void generate_unique_id_to_original_ids(const string &filename,
@@ -234,10 +280,15 @@
uniqueIdToOriginalIdsFile.close();
}

void generate_XU_unique(const string &filename, const vector< vector<int> > &XU,
const map< vector<int>, vector<int> > &pattern2Unitigs){
ofstream XUUnique;
openFileForWriting(filename, XUUnique);
void generate_XU_unique(const string &filename, const vector< boost::dynamic_bitset<> > &XU,
const map< boost::dynamic_bitset<>, vector<int> > &pattern2Unitigs, bool compress=false ){
//ofstream XUUnique;
//openFileForWriting(filename, XUUnique);
io::filtering_ostream XUUnique;
if( !init_sink( filename, XUUnique, compress ) ) {
cerr << "Unknown error when trying to open file \"" << filename << "\" for output!" << endl;
return;
}

//print the header
XUUnique << "pattern_id";
Expand All @@ -257,13 +308,14 @@ void generate_XU_unique(const string &filename, const vector< vector<int> > &XU,
XUUnique << " " << v;
XUUnique << endl;
}
XUUnique.close();
//XUUnique.close(); // filtering_ostream's destructor does this for us
}

//generate the pyseer input
void generatePyseerInput (const vector <string> &allReadFilesNames,
const string &outputFolder, const string &tmpFolder,
int nbContigs) {
const string &outputFolder,
const vector< boost::dynamic_bitset<> >& XU,
int nbContigs, bool compress=false ) {
//Generate the XU (the pyseer input - the unitigs are rows with strains present)
//XU_unique is XU is in matrix form (for Rtab input) with the duplicated rows removed
cerr << endl << endl << "[Generating pyseer input]..." << endl;
@@ -284,10 +336,11 @@

//create the files for pyseer
{
generate_XU(outputFolder+string("/unitigs.txt"), outputFolder+string("/graph.nodes"), XU);
map< vector<int>, vector<int> > pattern2Unitigs = getUnitigsWithSamePattern(XU, nbContigs);
generate_XU(outputFolder+string("/unitigs.txt"), outputFolder+string("/graph.nodes"), XU, compress );
auto pattern2Unitigs = getUnitigsWithSamePattern(XU);
cout << "Number of unique patterns: " << pattern2Unitigs.size() << endl;
generate_unique_id_to_original_ids(outputFolder+string("/unitigs.unique_rows_to_all_rows.txt"), pattern2Unitigs);
generate_XU_unique(outputFolder+string("/unitigs.unique_rows.Rtab"), XU, pattern2Unitigs);
generate_XU_unique(outputFolder+string("/unitigs.unique_rows.Rtab"), XU, pattern2Unitigs, compress );
}

cerr << "[Generating pyseer input] - Done!" << endl;
Expand All @@ -302,6 +355,7 @@ void map_reads::execute ()
string tmpFolder = outputFolder+string("/tmp");
string longReadsFile = tmpFolder+string("/readsFile");
int nbCores = getInput()->getInt(STR_NBCORES);
const bool compress = getInput()->get(STR_GZIP);

//get the nbContigs
int nbContigs = getNbLinesInFile(outputFolder+string("/graph.nodes"));
@@ -330,12 +384,21 @@
MapAndPhase(allReadFilesNames, *graph, outputFolder, tmpFolder, nbOfReadsProcessed, synchro,
*nodeIdToUnitigId, nbContigs));

cerr << endl << "[Mapping process finished!]" << endl;
// allUnitigPatterns has all samples/strains over the first dimension and
// unitig presence patterns over the second dimension (in bitsets).
// Here we transpose the matrix, while consuming it in order to gradually reduce memory footprint.
// For larger data sets this pattern accounting will dominate our memory footprint; overall memory consumption will peak here.
// Peak memory use occurs at the start and will be twice the matrix size (= 2 * (nbContigs*strains->size()/8) bytes).
cout << "[Transpose pattern matrix..]" << endl;
auto XU = transposeXU( allUnitigPatterns ); // this will consume allUnitigPatterns while transposing
allUnitigPatterns.resize(0); bitmap_container_t(allUnitigPatterns).swap(allUnitigPatterns); // release memory

//generate the pyseer input
generatePyseerInput(allReadFilesNames, outputFolder, tmpFolder, nbContigs);
cout << "[Generating pyseer input]..." << endl;
generatePyseerInput(allReadFilesNames, outputFolder, XU, nbContigs, compress);
cout << "[Generating pyseer input] - Done!" << endl;

cout << "Number of unique patterns: " << getNbLinesInFile(outputFolder+string("/unitigs.unique_rows.Rtab")) << endl;
//cout << "Number of unique patterns: " << getNbLinesInFile(outputFolder+string("/unitigs.unique_rows.Rtab")) << endl;

// Remove the global graph pointer, otherwise its destructor is called
// twice by GATB (after main) giving a HDF5 error
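Downstream tooling can read the compressed files back through the symmetric input chain. A minimal sketch, assuming a file produced with the new -gzip flag (file name illustrative):

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/device/file.hpp>
#include <iostream>
#include <string>

namespace io = boost::iostreams;

int main() {
    io::filtering_istream in;
    in.push(io::gzip_decompressor());                             // decompress on the fly
    in.push(io::file_source("unitigs.txt.gz", std::ios::binary));
    std::string line;
    while (std::getline(in, line))                                // lines come out as plain text
        std::cout << line << '\n';
    return 0;
}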
