From 6c3081662aacbe6a4e7ab9fdd078dbbc4ba4dd08 Mon Sep 17 00:00:00 2001 From: santeripuranen <26481940+santeripuranen@users.noreply.github.com> Date: Fri, 14 Feb 2020 00:02:32 +0200 Subject: [PATCH] Add option to gzip unitig output --- CMakeLists.txt | 4 +-- src/global.cpp | 2 ++ src/global.h | 1 + src/map_reads.cpp | 70 +++++++++++++++++++++++++++++++---------------- 4 files changed, 51 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c144336..7472d86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,7 @@ include (GatbCore) # find Boost set(BOOST_ROOT /usr/bin/include) set(Boost_USE_STATIC_LIBS ON) -FIND_PACKAGE(Boost 1.69.0 COMPONENTS program_options filesystem system regex REQUIRED) +FIND_PACKAGE(Boost 1.69.0 COMPONENTS program_options filesystem system regex iostreams REQUIRED) include_directories(${Boost_INCLUDE_DIR}) # cdbg-ops target @@ -102,7 +102,7 @@ set (PROGRAM_SOURCE_DIR ${PROJECT_SOURCE_DIR}/src) include_directories (${PROGRAM_SOURCE_DIR}) file (GLOB_RECURSE ProjectFiles ${PROGRAM_SOURCE_DIR}/*.cpp) add_executable(${PROGRAM} ${ProjectFiles}) -target_link_libraries(${PROGRAM} ${gatb-core-libraries} ${Boost_LIBRARIES} -static-libgcc -static-libstdc++) +target_link_libraries(${PROGRAM} ${gatb-core-libraries} ${Boost_LIBRARIES} -lz -static-libgcc -static-libstdc++) ################################################################################ # INSTALLATION diff --git a/src/global.cpp b/src/global.cpp index 1939c72..4ee5e48 100644 --- a/src/global.cpp +++ b/src/global.cpp @@ -36,6 +36,7 @@ const char* STR_STRAINS_FILE = "-strains"; const char* STR_KSKMER_SIZE = "-k"; const char* STR_OUTPUT = "-output"; const char* STR_NBCORES = "-nb-cores"; +const char* STR_GZIP = "-gzip"; //global vars used by both programs Graph *graph; @@ -47,4 +48,5 @@ void populateParser (Tool *tool) { tool->getParser()->push_front (new OptionOneParam (STR_OUTPUT, "Path to the folder where the final and temporary files will be stored.", 
false, "output")); tool->getParser()->push_front (new OptionOneParam (STR_KSKMER_SIZE, "K-mer size.", false, "31")); tool->getParser()->push_front (new OptionOneParam (STR_STRAINS_FILE, "A text file describing the strains containing 2 columns: 1) ID of the strain; 2) Path to a multi-fasta file containing the sequences of the strain. This file needs a header.", true)); + tool->getParser()->push_front (new OptionNoParam (STR_GZIP, "Compress unitig output using gzip.", false)); } diff --git a/src/global.h b/src/global.h index f0a7f2f..575d4a7 100644 --- a/src/global.h +++ b/src/global.h @@ -40,6 +40,7 @@ extern const char* STR_STRAINS_FILE; extern const char* STR_KSKMER_SIZE; extern const char* STR_OUTPUT; extern const char* STR_NBCORES; +extern const char* STR_GZIP; void populateParser (Tool *tool); diff --git a/src/map_reads.cpp b/src/map_reads.cpp index 0e078b0..c07bd28 100644 --- a/src/map_reads.cpp +++ b/src/map_reads.cpp @@ -32,10 +32,14 @@ #include "Utils.h" #include #include +#include <boost/iostreams/filtering_stream.hpp> +#include <boost/iostreams/filter/gzip.hpp> +#include <boost/iostreams/device/file.hpp> #include #define NB_OF_READS_NOTIFICATION_MAP_AND_PHASE 10 //Nb of reads that the map and phase must process for notification using namespace std; +namespace io = boost::iostreams; void mapReadToTheGraphCore(const string &read, const Graph &graph, const vector< UnitigIdStrandPos > &nodeIdToUnitigId, boost::dynamic_bitset<>& unitigPattern ) { @@ -43,7 +47,7 @@ void mapReadToTheGraphCore(const string &read, const Graph &graph, const vector< //goes through all nodes/kmers of the read if (read.size() >= graph.getKmerSize()) { - for (int i = 0; i < read.size() - graph.getKmerSize() + 1; i++) { + for (std::size_t i = 0; i < read.size() - graph.getKmerSize() + 1; i++) { string LRKmer = string(read.c_str() + i, graph.getKmerSize()); //TODO: From my tests, if you use buildNode with a Kmer containing an 'N', it will build a node with all Ns replaced by G @@ -74,13 +78,6 @@ void mapReadToTheGraphCore(const string &read, const Graph &graph, const vector< } } } -/* -void 
mapReadToTheGraph(const string &read, int readfileIndex, unsigned long readIndex, const Graph &graph, - const vector< UnitigIdStrandPos > &nodeIdToUnitigId, map &unitigIdToCount) { - //map the read - mapReadToTheGraphCore(read, graph, nodeIdToUnitigId, unitigIdToCount); -} -*/ // We define a functor that will be cloned by the dispatcher struct MapAndPhase @@ -141,7 +138,7 @@ struct MapAndPhase for (it.first(); !it.isDone(); it.next()) { string read = (it.item()).toString(); //transform the read to upper case - for (int j=0;j > &XUT ) // transpose the input matrix; this is a fairly expensive operation // due to the horrible memory access patterns involved here - for( auto i=0; i< XUT.size(); ++i ) + for( std::size_t i=0; i, vector > getUnitigsWithSamePattern (const vec return pattern2Unitigs; } -void generate_XU(const string &filename, const string &nodesFile, const vector< boost::dynamic_bitset<> > &XU) { +bool init_sink( const std::string& filename, io::filtering_ostream& os, bool compress=false ) +{ + if( compress ) { + os.push( io::gzip_compressor() ); + os.push( io::file_sink(filename+".gz",std::ios::binary) ); + } + else { + os.push( io::file_sink(filename) ); + } + + return os.is_complete(); +} + +void generate_XU(const string &filename, const string &nodesFile, const vector< boost::dynamic_bitset<> > &XU, bool compress=false ) { using bitmap_t = boost::dynamic_bitset<>; - ofstream XUFile; - openFileForWriting(filename, XUFile); + //ofstream XUFile; + //openFileForWriting(filename, XUFile); + io::filtering_ostream XUFile; + if( !init_sink( filename, XUFile, compress ) ) { + cerr << "Unknown error when trying to open file \"" << filename << "\" for output!" 
<< endl; + return; + } //open file with list of node sequences ifstream nodesFileReader; @@ -226,7 +241,7 @@ void generate_XU(const string &filename, const string &nodesFile, const vector< XUFile << endl; } nodesFileReader.close(); - XUFile.close(); + //XUFile.close(); // filtering_ostream's destructor does this for us } void generate_unique_id_to_original_ids(const string &filename, @@ -250,9 +265,14 @@ void generate_unique_id_to_original_ids(const string &filename, } void generate_XU_unique(const string &filename, const vector< boost::dynamic_bitset<> > &XU, - const map< boost::dynamic_bitset<>, vector > &pattern2Unitigs){ - ofstream XUUnique; - openFileForWriting(filename, XUUnique); + const map< boost::dynamic_bitset<>, vector > &pattern2Unitigs, bool compress=false ){ + //ofstream XUUnique; + //openFileForWriting(filename, XUUnique); + io::filtering_ostream XUUnique; + if( !init_sink( filename, XUUnique, compress ) ) { + cerr << "Unknown error when trying to open file \"" << filename << "\" for output!" 
<< endl; + return; + } //print the header XUUnique << "pattern_id"; @@ -272,22 +292,23 @@ void generate_XU_unique(const string &filename, const vector< boost::dynamic_bit XUUnique << " " << pattern[i]; XUUnique << endl; } - XUUnique.close(); + //XUUnique.close(); // filtering_ostream's destructor does this for us } //generate the pyseer input void generatePyseerInput (const vector &allReadFilesNames, const string &outputFolder, const vector< boost::dynamic_bitset<> >& XU, - int nbContigs) { + int nbContigs, bool compress=false ) { //Generate the XU (the pyseer input - the unitigs are rows with strains present) //XU_unique is XU is in matrix form (for Rtab input) with the duplicated rows removed //create the files for pyseer { - generate_XU(outputFolder+string("/unitigs.txt"), outputFolder+string("/graph.nodes"), XU); + generate_XU(outputFolder+string("/unitigs.txt"), outputFolder+string("/graph.nodes"), XU, compress ); auto pattern2Unitigs = getUnitigsWithSamePattern(XU); + cout << "Number of unique patterns: " << pattern2Unitigs.size() << endl; generate_unique_id_to_original_ids(outputFolder+string("/unitigs.unique_rows_to_all_rows.txt"), pattern2Unitigs); - generate_XU_unique(outputFolder+string("/unitigs.unique_rows.Rtab"), XU, pattern2Unitigs); + generate_XU_unique(outputFolder+string("/unitigs.unique_rows.Rtab"), XU, pattern2Unitigs, compress ); } } @@ -300,6 +321,7 @@ void map_reads::execute () string tmpFolder = outputFolder+string("/tmp"); string longReadsFile = tmpFolder+string("/readsFile"); int nbCores = getInput()->getInt(STR_NBCORES); + const bool compress = getInput()->get(STR_GZIP); //get the nbContigs int nbContigs = getNbLinesInFile(outputFolder+string("/graph.nodes")); @@ -336,7 +358,7 @@ void map_reads::execute () // allUnitigPatterns has all samples/strains over the first dimension and // unitig presense patterns over the second dimension (in bitsets). 
// Here we transpose the matrix, while consuming it in order to gradually reduce memory footprint. - // For larger data sets this pattern accounting will dominate our emory footprint; overall memory consumption will peak here. + // For larger data sets this pattern accounting will dominate our memory footprint; overall memory consumption will peak here. // Peak memory use occurs at the start and will be twice the matrix size (= 2 * (nbContigs*strains->size()/8) bytes). cout << "[Transpose pattern matrix..]" << endl; auto XU = transposeXU( allUnitigPatterns ); // this will consume allUnitigPatterns while transposing @@ -344,10 +366,10 @@ void map_reads::execute () //generate the pyseer input cout << "[Generating pyseer input]..." << endl; - generatePyseerInput(allReadFilesNames, outputFolder, XU, nbContigs); + generatePyseerInput(allReadFilesNames, outputFolder, XU, nbContigs, compress); cout << "[Generating pyseer input] - Done!" << endl; - cout << "Number of unique patterns: " << getNbLinesInFile(outputFolder+string("/unitigs.unique_rows.Rtab")) << endl; + //cout << "Number of unique patterns: " << getNbLinesInFile(outputFolder+string("/unitigs.unique_rows.Rtab")) << endl; // Remove the global graph pointer, otherwise its destructor is called // twice by GATB (after main) giving a HDF5 error