diff --git a/src/BUSData.cpp b/src/BUSData.cpp index c97f80f..397e6dd 100644 --- a/src/BUSData.cpp +++ b/src/BUSData.cpp @@ -258,7 +258,7 @@ bool writeECs(const std::string &filename, const BUSHeader &header) { return true; } -bool writeGenes(const std::string &filename, const std::unordered_map &genenames) { +bool writeGenes(const std::string &filename, const u_map_ &genenames) { std::ofstream outf; outf.open(filename.c_str(), std::ios::out); @@ -279,7 +279,7 @@ bool writeGenes(const std::string &filename, const std::unordered_map &txnames) { +bool parseTranscripts(const std::string &filename, u_map_ &txnames) { std::ifstream inf(filename.c_str()); int i = 0; @@ -291,7 +291,7 @@ bool parseTranscripts(const std::string &filename, std::unordered_map &txnames, std::unordered_set &captures) { +bool parseTxCaptureList(const std::string &filename, u_map_ &txnames, std::unordered_set &captures) { std::ifstream inf(filename.c_str()); std::string txp; @@ -318,7 +318,7 @@ bool parseBcUmiCaptureList(const std::string &filename, std::unordered_set &project_map) { +bool parse_ProjectMap(const std::string &filename, u_map_ &project_map) { // This function occurs in 3 places: here, BUSData.h, and bustools_project.cpp std::ifstream inf(filename.c_str()); @@ -346,7 +346,7 @@ bool parseFlagsCaptureList(const std::string &filename, std::unordered_set &txnames, std::vector &genemap, std::unordered_map &genenames) { +bool parseGenes(const std::string &filename, const u_map_ &txnames, std::vector &genemap, u_map_ &genenames) { std::ifstream inf(filename.c_str()); std::string line, t; diff --git a/src/BUSData.h b/src/BUSData.h index 5c697f7..227f889 100644 --- a/src/BUSData.h +++ b/src/BUSData.h @@ -7,6 +7,7 @@ #include #include #include +#include "Common.hpp" const uint32_t BUSFORMAT_VERSION = 1; @@ -66,15 +67,15 @@ int identifyParseHeader(std::istream &inf, BUSHeader &header, compressed_BUSHead bool parseECs_stream(std::istream &in, BUSHeader &header); bool parseECs(const std::string &filename, BUSHeader &header); bool writeECs(const std::string &filename, const BUSHeader &header); -bool writeGenes(const std::string &filename, const std::unordered_map &genenames); -bool parseGenes(const std::string &filename, const std::unordered_map &txnames, std::vector &genemap, std::unordered_map &genenames); +bool writeGenes(const std::string &filename, const u_map_ &genenames); +bool parseGenes(const std::string &filename, const u_map_ &txnames, std::vector &genemap, u_map_ &genenames); bool parseGenesList(const std::string& filename, std::vector& geneNames); -bool parseTxCaptureList(const std::string &filename, std::unordered_map &txnames, std::unordered_set &captures); +bool parseTxCaptureList(const std::string &filename, u_map_ &txnames, std::unordered_set &captures); bool parseBcUmiCaptureList(const std::string &filename, std::unordered_set &captures); bool parseFlagsCaptureList(const std::string &filename, std::unordered_set &captures); -bool parseTranscripts(const std::string &filename, std::unordered_map &txnames); +bool parseTranscripts(const std::string &filename, u_map_ &txnames); -bool parse_ProjectMap(const std::string &filename, std::unordered_map &project_map); +bool parse_ProjectMap(const std::string &filename, u_map_ &project_map); uint64_t stringToBinary(const std::string &s, uint32_t &flag); uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag); diff --git a/src/Common.cpp b/src/Common.cpp index ad96c96..58c169c 100644 --- a/src/Common.cpp +++ b/src/Common.cpp @@ -73,7 +73,7 @@ std::vector intersect_vectors(const std::vector> & return std::move(u); } -int32_t intersect_ecs(const std::vector &ecs, std::vector &u, const std::vector &genemap, std::vector> &ecmap, std::unordered_map, int32_t, SortedVectorHasher> &ecmapinv, std::vector> &ec2genes) { +int32_t intersect_ecs(const std::vector &ecs, std::vector &u, const std::vector &genemap, std::vector> &ecmap, EcMapInv &ecmapinv, std::vector> &ec2genes) { if (ecs.empty()) { return -1; } @@ -91,13 +91,13 @@ int32_t intersect_ecs(const std::vector &ecs, std::vector &u, for (size_t i = 0; i< v.size(); i++) { u.push_back(v[i]); } - + for (size_t i = 1; i < ecs.size(); i++) { if (ecs[i] < 0 || ecs[i] >= ecmap.size()) { return -1; } const auto &v = ecmap[ecs[i]]; - + int j = 0; int k = 0; int l = 0; @@ -212,7 +212,7 @@ void intersect_genes_of_ecs(const std::vector &ecs, const std::vector< } -int32_t intersect_ecs_with_genes(const std::vector &ecs, const std::vector &genemap, std::vector> &ecmap, std::unordered_map, int32_t, SortedVectorHasher> &ecmapinv, std::vector> &ec2genes, bool assumeIntersectionIsEmpty) { +int32_t intersect_ecs_with_genes(const std::vector &ecs, const std::vector &genemap, std::vector> &ecmap, EcMapInv &ecmapinv, std::vector> &ec2genes, bool assumeIntersectionIsEmpty) { std::vector> gu; // per gene transcript results std::vector u; // final list of transcripts @@ -327,9 +327,82 @@ void create_ec2genes(const std::vector> &ecmap, const std:: } } +COUNT_MTX_TYPE intersect_ecs_with_subset_txs(int32_t ec, const std::vector> &ecmap, const std::vector& tx_split) { + if (tx_split.size() == 0) return COUNT_DEFAULT; + std::vector ecs; + ecs.push_back(ec); + return intersect_ecs_with_subset_txs(ecs, ecmap, tx_split); +} + +COUNT_MTX_TYPE intersect_ecs_with_subset_txs(const std::vector& ecs, const std::vector> &ecmap, const std::vector& tx_split) { + // Note: tx_split indices are tx ids and values are 1 (exists in split) or 0 (does not exist in split) + if (tx_split.size() == 0) return COUNT_DEFAULT; + if (ecs.size() == 0) return COUNT_AMBIGUOUS; // Shouldn't happen + std::vector u; + u.resize(0); + auto &v = ecmap[ecs[0]]; // copy + for (size_t i = 0; i< v.size(); i++) { + u.push_back(v[i]); + } + for (size_t i = 1; i < ecs.size(); i++) { + const auto &v = ecmap[ecs[i]]; + + int j = 0; + int k = 0; + int l = 0; + int n = u.size(); + int m = v.size(); + // u and v are sorted, j,k,l = 0 + while (j < n && l < m) { + // invariant: u[:k] is the intersection of u[:j] and v[:l], j <= n, l <= m + // u[:j] <= u[j:], v[:l] <= v[l:], u[j:] is sorted, v[l:] is sorted, u[:k] is sorted + if (u[j] < v[l]) { + j++; + } else if (u[j] > v[l]) { + l++; + } else { + // match + if (k < j) { + std::swap(u[k], u[j]); + } + k++; + j++; + l++; + } + } + if (k < n) { + u.resize(k); + } + } + size_t n_1 = 0; + size_t n_2 = 0; + for (auto t : u) { + if(tx_split[t]) { + n_2++; + } else { + n_1++; + } + if (n_1 > 0 && n_2 > 0) break; // Stop searching + } + return (n_1 > 0 && n_2 > 0 ? COUNT_AMBIGUOUS : (n_1 > 0 ? COUNT_DEFAULT : COUNT_SPLIT)); + /*for (auto ec : ecs) { // We still need to optimize this + for (auto t: ecmap[ec]) { + if(std::find(tx_split.begin(), tx_split.end(), t) != tx_split.end()) { + n_2++; + } else { + n_1++; + } + if (n_1 > 0 && n_2 > 0) break; // Stop searching + } + if (n_1 > 0 && n_2 > 0) break; // Stop searching + }*/ + //return (n_1 > 0 && n_2 > 0 ? COUNT_AMBIGUOUS : (n_1 > 0 ? COUNT_DEFAULT : COUNT_SPLIT)); +} + + void copy_file(std::string src, std::string dest) { std::ifstream isrc(src, std::ios::binary); std::ofstream idest(dest, std::ios::binary); idest << isrc.rdbuf(); -} \ No newline at end of file +} diff --git a/src/Common.hpp b/src/Common.hpp index f67ee5c..badd7c4 100644 --- a/src/Common.hpp +++ b/src/Common.hpp @@ -9,9 +9,12 @@ #include #include #include +#include "roaring.hh" +#include "hash.hpp" -#define BUSTOOLS_VERSION "0.42.0" +#define BUSTOOLS_VERSION "0.43.0" +#define u_map_ std::unordered_map enum CAPTURE_TYPE : char { CAPTURE_NONE = 0, @@ -35,6 +38,12 @@ enum PROJECT_TYPE : char PROJECT_TX, PROJECT_F }; +enum COUNT_MTX_TYPE : char +{ + COUNT_DEFAULT = 0, + COUNT_SPLIT, + COUNT_AMBIGUOUS +}; struct Bustools_opt { @@ -62,6 +71,7 @@ struct Bustools_opt std::string count_genes; std::string count_ecs; std::string count_txp; + std::string count_split; bool count_em = false; bool count_cm = false; bool count_collapse = false; @@ -75,6 +85,7 @@ struct Bustools_opt std::string dump; bool dump_bool = false; bool split_correct = false; + bool barcode_replacement = false; /* predict */ std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this @@ -98,6 +109,7 @@ struct Bustools_opt /* text */ bool text_dumpflags = false; bool text_dumppad = false; + bool text_showall = false; /* linker */ int start, end; @@ -151,22 +163,42 @@ struct SortedVectorHasher int i = 0; for (auto x : v) { - uint64_t t = std::hash{}(x); + uint64_t t; + MurmurHash3_x64_64(&x,sizeof(x), 0,&t); t = (x >> i) | (x << (64 - i)); r = r ^ t; - i = (i + 1) % 64; + i = (i+1)&63; } return r; } }; + +struct RoaringHasher { + size_t operator()(const Roaring& rr) const { + uint64_t r = 0; + int i=0; + for (auto x : rr) { + uint64_t t; + MurmurHash3_x64_64(&x, sizeof(x), 0, &t); + t = (x>>i) | (x<<(64-i)); + r ^= t; + i = (i+1)&63; // (i+1)%64 + } + return r; + } +}; +typedef u_map_, int32_t, SortedVectorHasher> EcMapInv; + std::vector intersect(std::vector &u, std::vector &v); std::vector union_vectors(const std::vector> &v); std::vector intersect_vectors(const std::vector> &v); -int32_t intersect_ecs(const std::vector &ecs, std::vector &u, const std::vector &genemap, std::vector> &ecmap, std::unordered_map, int32_t, SortedVectorHasher> &ecmapinv, std::vector> &ec2genes); +int32_t intersect_ecs(const std::vector &ecs, std::vector &u, const std::vector &genemap, std::vector> &ecmap, EcMapInv &ecmapinv, std::vector> &ec2genes); void vt2gene(const std::vector &v, const std::vector &genemap, std::vector &glist); void intersect_genes_of_ecs(const std::vector &ecs, const std::vector> &ec2genes, std::vector &glist); -int32_t intersect_ecs_with_genes(const std::vector &ecs, const std::vector &genemap, std::vector> &ecmap, std::unordered_map, int32_t, SortedVectorHasher> &ecmapinv, std::vector> &ec2genes, bool assumeIntersectionIsEmpty = true); +int32_t intersect_ecs_with_genes(const std::vector &ecs, const std::vector &genemap, std::vector> &ecmap, EcMapInv &ecmapinv, std::vector> &ec2genes, bool assumeIntersectionIsEmpty = true); void create_ec2genes(const std::vector> &ecmap, const std::vector &genemap, std::vector> &ec2gene); +COUNT_MTX_TYPE intersect_ecs_with_subset_txs(int32_t ec, const std::vector> &ecmap, const std::vector& tx_split); +COUNT_MTX_TYPE intersect_ecs_with_subset_txs(const std::vector& ecs, const std::vector> &ecmap, const std::vector& tx_split); void copy_file(std::string src, std::string dest); diff --git a/src/bustools_capture.cpp b/src/bustools_capture.cpp index d03e1fd..a90ff2e 100644 --- a/src/bustools_capture.cpp +++ b/src/bustools_capture.cpp @@ -12,11 +12,11 @@ void bustools_capture(Bustools_opt &opt) { std::unordered_set captures; std::vector> ecmap; - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + u_map_, int32_t, SortedVectorHasher> ecmapinv; if (opt.type == CAPTURE_TX) { // parse ecmap and capture list - std::unordered_map txnames; + u_map_ txnames; std::cerr << "Parsing transcripts .. "; std::cerr.flush(); parseTranscripts(opt.count_txp, txnames); std::cerr << "done" << std::endl; diff --git a/src/bustools_clusterhist.cpp b/src/bustools_clusterhist.cpp index 2bbb667..591eeeb 100644 --- a/src/bustools_clusterhist.cpp +++ b/src/bustools_clusterhist.cpp @@ -18,13 +18,13 @@ void bustools_clusterhist(Bustools_opt& opt) { // read and parse the equivelence class files - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + u_map_, int32_t, SortedVectorHasher> ecmapinv; std::vector> ecmap; - std::unordered_map txnames; + u_map_ txnames; parseTranscripts(opt.count_txp, txnames); std::vector genemap(txnames.size(), -1); - std::unordered_map genenames; + u_map_ genenames; parseGenes(opt.count_genes, txnames, genemap, genenames); parseECs(opt.count_ecs, h); ecmap = std::move(h.ecs); @@ -52,7 +52,7 @@ void bustools_clusterhist(Bustools_opt& opt) { //Read the cluster file std::vector clusterNames; - std::unordered_map bcClusters; + u_map_ bcClusters; { std::ifstream ifs(opt.cluster_input_file); uint32_t flag = 0; diff --git a/src/bustools_collapse.cpp b/src/bustools_collapse.cpp index 2111c9b..120d0ef 100644 --- a/src/bustools_collapse.cpp +++ b/src/bustools_collapse.cpp @@ -17,13 +17,13 @@ void bustools_collapse(Bustools_opt &opt) { // read and parse the equivelence class files - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + u_map_, int32_t, SortedVectorHasher> ecmapinv; std::vector> ecmap; - std::unordered_map txnames; + u_map_ txnames; parseTranscripts(opt.count_txp, txnames); std::vector genemap(txnames.size(), -1); - std::unordered_map genenames; + u_map_ genenames; parseGenes(opt.count_genes, txnames, genemap, genenames); parseECs(opt.count_ecs, h); ecmap = std::move(h.ecs); diff --git a/src/bustools_correct.cpp b/src/bustools_correct.cpp index 588b744..ef50271 100644 --- a/src/bustools_correct.cpp +++ b/src/bustools_correct.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include "Common.hpp" @@ -258,6 +259,7 @@ void bustools_split_correct(Bustools_opt &opt) } int rc = 0; + uint64_t len_mask = ((1ULL << (2*bclen)) - 1); // Only include n least significant bits where n=2*bclen while (true) { in.read((char *)p, N * sizeof(BUSData)); @@ -276,7 +278,7 @@ void bustools_split_correct(Bustools_opt &opt) bd = p[i]; - uint64_t b = bd.barcode; + uint64_t b = bd.barcode & len_mask; uint64_t bc12 = b & mask_12; uint64_t bc34 = (b >> (2 * len_12)) & mask_34; @@ -366,14 +368,14 @@ void bustools_split_correct(Bustools_opt &opt) if (dump_bool) { - if (bd.barcode != old_barcode) + if ((bd.barcode & len_mask) != old_barcode) { - of << binaryToString(bd.barcode, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n"; - old_barcode = bd.barcode; + of << binaryToString(bd.barcode & len_mask, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n"; + old_barcode = bd.barcode & len_mask; } } - bd.barcode = b_corrected; + bd.barcode = b_corrected | (bd.barcode & ~len_mask); bus_out.write((char *)&bd, sizeof(bd)); if (corrected_12_flag && corrected_34_flag) @@ -415,10 +417,268 @@ void bustools_split_correct(Bustools_opt &opt) p = nullptr; } -void bustools_correct(Bustools_opt &opt) -{ +void bustools_correct_replace(Bustools_opt &opt) { uint32_t bclen = 0; + uint32_t umilen = 0; + uint32_t rplen = 0; uint32_t wc_bclen = 0; + BUSHeader h; + size_t nr = 0; + size_t N = 100000; + BUSData *p = new BUSData[N]; + char magic[4]; + uint32_t version = 0; + size_t stat_white = 0; + size_t stat_uncorr = 0; + uint64_t old_barcode; + enum replacement_type { bc_record, msb_meta, lsb_meta, msb_bus, lsb_bus }; + + // There are five replacement types: + // bc_record: simply replace the barcode of length 'bclen' with another barcode of length 'bclen' (while preserving metadata) + // msb_meta: put replacement (format: NNNN<) into most significant bits of metadata in BUS record (preserve barcode record) + // lsb_meta: put replacement (format: rp_map; // Replacement map (key = onlisted bc; value = replacement bc) + replacement_type rtype = bc_record; + uint32_t f = 0; + + while (std::getline(wf, line)) { + std::stringstream ss(line); + std::string barcode; + std::string replacement; + ss >> barcode >> replacement; + if (barcode.empty() || replacement.empty()) continue; + if (!barcode.empty() && replacement.empty()) { + std::cerr << "Error: replacement file malformed; no replacement found for barcode: " << barcode << std::endl; + exit(1); + } + std::transform(barcode.begin(), barcode.end(), barcode.begin(), ::toupper); + std::transform(replacement.begin(), replacement.end(), replacement.begin(), ::toupper); + replacement_type rtype_ = bc_record; + if (replacement[0] == '<') rtype_ = lsb_meta; + if (replacement[0] == '*') rtype_ = lsb_bus; + if (rtype_ != bc_record) { + replacement = replacement.substr(1); + } else { + if (replacement[replacement.length()-1] == '<') rtype_ = msb_meta; + if (replacement[replacement.length()-1] == '*') rtype_ = msb_bus; + if (rtype_ != bc_record) { + replacement = replacement.substr(0, replacement.length()-1); + } + } + if (replacement.length() == 0) continue; + + if (wc_bclen == 0) { + rtype = rtype_; + wc_bclen = barcode.length(); + rplen = replacement.length(); + } + if (rtype != rtype_) { + std::cerr << "Error: Replacement types not consistent in file" << std::endl; + exit(1); + } + if (wc_bclen != barcode.length()) { + std::cerr << "Error: Barcode lengths not consistent in file" << std::endl; + exit(1); + } + + uint64_t rl = replacement.length(); + uint64_t bc = stringToBinary(barcode, f); + uint64_t rp = stringToBinary(replacement, f); + if (rp_map.find(bc) != rp_map.end()) { + std::cerr << "Error: Duplicate entries found: " << barcode << std::endl; + exit(1); + } + if (rplen != rl) { + std::cerr << "Error: replacement length in list inconsistent" << std::endl; + exit(1); + } + rp_map.insert(std::make_pair(bc, rp)); + } + wf.close(); + + if (rp_map.size() == 0) { + std::cerr << "Error: replacement file malformed; no barcodes found" < rplen) { + h.bclen = rplen; + } + } + if (umilen == 0) { + umilen = h.umilen; + } + + if (!outheader_written) { + writeHeader(bus_out, h); + outheader_written = true; + } + + int rc = 0; + uint64_t len_mask = ((1ULL << (2*bclen)) - 1); // Only include n least significant bits where n=2*bclen + while (true) { + in.read((char *)p, N * sizeof(BUSData)); + size_t rc = in.gcount() / sizeof(BUSData); + if (rc == 0) { + break; + } + nr += rc; + + for (size_t i = 0; i < rc; i++) { + bd = p[i]; + uint64_t b = bd.barcode & len_mask; + uint64_t correction = 0; + uint64_t b_lookup = b; + if (rtype == lsb_bus || rtype == msb_meta || rtype == lsb_meta) { + // For these, look up based off LSBs (off of wc_bclen) + b_lookup = b_lookup & ((1ULL << (2*wc_bclen)) - 1); + } + if (rtype == msb_bus) { + // For this, look up based off MSBs (from beginning of barcode) + if (bclen >= rplen) { + b_lookup = b_lookup & (~(1ULL << (2*(bclen-rplen)))); + } + } + auto it = rp_map.find(b_lookup); + if (it != rp_map.end()) { + stat_white++; + correction = it->second; + switch (rtype) { + case bc_record: // This is the only option where we'll allow replacement to be shorter than barcode + { + uint64_t len_mask2 = ((1ULL << (2*std::max(rplen, bclen))) - 1); // n least significant bits where n=2*max(rplen,bclen) [if rplen > bclen, overwrite based on rplen] + bd.barcode = bd.barcode & ~len_mask2; // Preserve the metadata bits outside barcode length (or overwrites the part where the barcode length exceeds it) + if (rplen < bclen) { + bd.barcode = bd.barcode >> (2*(bclen-rplen)); + bd.barcode = bd.barcode & ~((1ULL << (2*(rplen))) - 1); // Delete everything within rplen (i.e. where the correction will eventually be) + } + bd.barcode = correction | (bd.barcode & ~len_mask2); // Correction + break; + } + case msb_meta: + { + uint64_t clen = 32-rplen; // 32 minus correction length + bd.barcode = bd.barcode & ((1ULL << (2*clen)) - 1); // Unset the MSBs + bd.barcode = (correction << (2*clen) ) | bd.barcode; // Shift the corrected sequence into the MSBs + break; + } + case lsb_meta: + { + uint64_t original_bc = bd.barcode & len_mask; // The original barcode sequence (no metadata) + bd.barcode = bd.barcode << (2*rplen); // Shift the barcode+metadata to the left based on rplen + uint64_t mlen = (rplen+bclen); // How much space the new metadata plus the original barcode will take up + bd.barcode = bd.barcode & (~((1ULL << (2*mlen)) - 1)); // Preserve only the bits containing the (shifted) metadata + bd.barcode = bd.barcode | original_bc; // Throw the original barcode back in + bd.barcode = (correction << (2*bclen) ) | bd.barcode; // Throw the new metadata in + break; + } + case msb_bus: + { + if (bclen >= rplen) { // Only do the substitution if barcode encapsulates the replacement + uint64_t mdata = bd.barcode & (~((1ULL << (2*bclen)) - 1)); // Preserve only the bits containing the metadata (not the barcode) + uint64_t bdata = bd.barcode & ((1ULL << (2*(bclen-rplen))) - 1); // Preserve only the bits containing the part of the barcode we want to keep + bd.barcode = (mdata | bdata) | (correction << (2*(bclen-rplen))); // Merge everything together + } + break; + } + case lsb_bus: + { + bd.barcode = correction | (bd.barcode & ~((1ULL << (2*rplen)) - 1)); // Set 2*rplen LSBs to 0, and put the new replacement in + break; + } + } + bus_out.write((char *)&bd, sizeof(bd)); + if (dump_bool) { + if (b != old_barcode) { + of << binaryToString(b, bclen) << "\t" << binaryToString(correction, bclen) << "\n"; + old_barcode = b; + } + } + } else { + // No correction; except shift metadata right if necessary + if (rtype == bc_record && rplen < bclen) { + uint64_t shifted_bc = bd.barcode >> (2*(bclen-rplen)); + shifted_bc = shifted_bc & ~((1ULL << (2*(rplen))) - 1); // Delete everything within rplen (i.e. where the replacement would be) + bd.barcode = (bd.barcode & ((1ULL << (2*(rplen))) - 1)); // Preserve only the LSB rlen stuff + bd.barcode = shifted_bc | bd.barcode; // Merge + } + bus_out.write((char *)&bd, sizeof(bd)); + stat_uncorr++; + } + } + } + } + + std::cerr << "Processed " << nr << " BUS records" << std::endl + << "Replaced = " << stat_white << std::endl + << "Not replaced = " << stat_uncorr << std::endl; + + if (!opt.stream_out) { + busf_out.close(); + } + + if (opt.dump_bool) { + of.close(); // if of is open + } + + delete[] p; + p = nullptr; +} + +void bustools_correct(Bustools_opt &opt) { + if (opt.barcode_replacement) { // Run in replacement mode + bustools_correct_replace(opt); + return; + } + uint32_t bclen = 0; + std::vector wc_bclen; uint32_t umilen = 0; BUSHeader h; size_t nr = 0; @@ -434,57 +694,97 @@ void bustools_correct(Bustools_opt &opt) bool dump_bool = opt.dump_bool; std::ofstream of; - if (dump_bool) - { + if (dump_bool) { of.open(opt.dump); } std::ifstream wf(opt.whitelist, std::ios::in); std::string line; line.reserve(100); - std::unordered_set wbc; - wbc.reserve(100000); + std::vector > wbc; // Each set contains whitelist uint32_t f = 0; - while (std::getline(wf, line)) - { - if (wc_bclen == 0) - { - wc_bclen = line.size(); + bool first_line = true; + while (std::getline(wf, line)) { + std::stringstream ss(line); + std::string barcode; + int i = 0; + while (ss >> barcode) { + std::transform(barcode.begin(), barcode.end(), barcode.begin(), ::toupper); + uint64_t bc = stringToBinary(barcode, f); + if (first_line) { // First line establishes all the barcode sets (can't have any empty barcodes here) + std::unordered_set bc_set; + bc_set.insert(bc); + wbc.push_back(bc_set); + wbc[i].reserve(100000); + wc_bclen.push_back(barcode.size()); + } else if (barcode == "-") { + i++; + continue; // Empty barcode + } else if (i >= wbc.size()) { // Too many barcodes in this line + std::cerr << "Error: on-list file malformed; encountered " << (i+1) + << " barcodes on a line while " << wbc.size() << " barcodes on a previous line" + << std::endl; + exit(1); + } else if (barcode.length() != wc_bclen[i]) { + std::cerr << "Error: on-list file malformed; encountered barcode length " << wc_bclen[i] + << " on a line but barcode length " << barcode.length() << " on another line" + << std::endl; + exit(1); + } else { + wbc[i].insert(bc); + } + i++; } - uint64_t bc = stringToBinary(line, f); - wbc.insert(bc); + if (i == 0) continue; // empty line + first_line = false; } wf.close(); - std::cerr << "Found " << wbc.size() << " barcodes in the whitelist" << std::endl; - - // split barcode into upper and lower half - size_t bc2 = (wc_bclen + 1) / 2; - - std::vector> correct(1ULL << (2 * bc2)); // 4^(bc/2) possible barcodes - - uint64_t mask_size = (1ULL << (2 * bc2)); - uint64_t lower_mask = (1ULL << (2 * bc2)) - 1; - uint64_t upper_mask = (1ULL << (2 * (wc_bclen - bc2))) - 1; - for (uint64_t b : wbc) - { - uint64_t lb = b & lower_mask; - uint64_t ub = (b >> (2 * bc2)) & upper_mask; - - correct[ub].second.add(lb); - correct[lb].first.add(ub); + if (wbc.size() == 0) { + std::cerr << "Error: on-list file malformed; no barcodes found" < 1) { + std::cerr << "Found " << wbc.size() << " barcode sets" << std::endl; + } + + // split barcode into upper and lower half (across all barcodes in a barcode set) + std::vector>> correct_vec; // size of vector = how many barcode sets there are + std::vector > lower_upper_mask_vec; // size of vector = how many barcode sets there are + std::vector bc2_vec; // size of vector = how many barcode sets there are + for (int i = 0; i < wc_bclen.size(); i++) { + auto bclen2 = wc_bclen[i]; // i = index of current barcode set + size_t bc2 = (bclen2 + 1) / 2; + std::vector> correct(1ULL << (2 * bc2)); // 4^(bc/2) possible barcodes + uint64_t mask_size = (1ULL << (2 * bc2)); + uint64_t lower_mask = (1ULL << (2 * bc2)) - 1; + uint64_t upper_mask = (1ULL << (2 * (bclen2 - bc2))) - 1; + for (uint64_t b : wbc[i]) { // Iterate through barcodes of current barcode set + uint64_t lb = b & lower_mask; + uint64_t ub = (b >> (2 * bc2)) & upper_mask; + correct[ub].second.add(lb); + correct[lb].first.add(ub); + } + correct_vec.push_back(std::move(correct)); + lower_upper_mask_vec.push_back(std::make_pair(lower_mask, upper_mask)); + bc2_vec.push_back(bc2); } std::streambuf *buf = nullptr; std::ofstream busf_out; - if (!opt.stream_out) - { + if (!opt.stream_out) { busf_out.open(opt.output, std::ios::out | std::ios::binary); buf = busf_out.rdbuf(); } - else - { + else { buf = std::cout.rdbuf(); } std::ostream bus_out(buf); @@ -493,111 +793,121 @@ void bustools_correct(Bustools_opt &opt) nr = 0; BUSData bd; - for (const auto &infn : opt.files) - { + for (const auto &infn : opt.files) { std::streambuf *inbuf; std::ifstream inf; - if (!opt.stream_in) - { + if (!opt.stream_in) { inf.open(infn.c_str(), std::ios::binary); inbuf = inf.rdbuf(); - } - else - { + } else { inbuf = std::cin.rdbuf(); } std::istream in(inbuf); parseHeader(in, h); - if (!outheader_written) - { + if (!outheader_written) { writeHeader(bus_out, h); outheader_written = true; } - if (bclen == 0) - { + if (bclen == 0) { bclen = h.bclen; + size_t final_wc_bclen = 0; + + for (auto l : wc_bclen) { + final_wc_bclen += l; + } - if (bclen != wc_bclen) - { - std::cerr << "Error: barcode length and whitelist length differ, barcodes = " << bclen << ", whitelist = " << wc_bclen << std::endl - << " check that your whitelist matches the technology used" << std::endl; + if (bclen != final_wc_bclen) { + std::cerr << "Error: barcode length and on-list length differ, barcodes = " << bclen << ", on-list = " << final_wc_bclen << std::endl + << " check that your on-list matches the technology used" << std::endl; exit(1); } } - if (umilen == 0) - { + if (umilen == 0) { umilen = h.umilen; } int rc = 0; + uint64_t len_mask = ((1ULL << (2*bclen)) - 1); // Only include n least significant bits where n=2*bclen while (true) { in.read((char *)p, N * sizeof(BUSData)); size_t rc = in.gcount() / sizeof(BUSData); - if (rc == 0) - { + if (rc == 0) { break; } nr += rc; - for (size_t i = 0; i < rc; i++) - { + for (size_t i = 0; i < rc; i++) { bd = p[i]; - auto it = wbc.find(bd.barcode); - if (it != wbc.end()) - { - stat_white++; - bus_out.write((char *)&bd, sizeof(bd)); - } - else - { - uint64_t b = bd.barcode; - uint64_t lb = b & lower_mask; - uint64_t ub = (b >> (2 * bc2)) & upper_mask; - uint64_t lbc = 0, ubc = 0; - int correct_lower = search_for_mismatch(correct[ub].second, bc2, lb, lbc); - int correct_upper = search_for_mismatch(correct[lb].first, wc_bclen - bc2, ub, ubc); - int nc = correct_lower + correct_upper; - if (nc != 1) - { - stat_uncorr++; + uint64_t b = bd.barcode & len_mask; + uint64_t running_len = 0; + size_t stat_white_ = 0; + size_t stat_uncorr_ = 0; + size_t stat_corr_ = 0; + uint64_t correction = 0; + for (int j = wbc.size()-1; j >= 0; j--) { // Iterate through all the barcode sets + auto bclen2 = wc_bclen[j]; + running_len += bclen2; + uint64_t shift_len = 2*(running_len-bclen2); // used for masking out the least significant bits up to the current barcode + uint64_t len_mask2 = 0; // This mask = Only consider these bits (based on each barcode set) + len_mask2 = ((1ULL << (2*running_len)) - 1); + if (shift_len != 0) { + len_mask2 &= (~((1ULL << (shift_len)) - 1)); } - else if (nc == 1) - { - if (correct_lower == 1) - { - uint64_t b_corrected = (ub << (2 * bc2)) | lbc; - if (dump_bool) - { - if (bd.barcode != old_barcode) - { - of << binaryToString(bd.barcode, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n"; - old_barcode = bd.barcode; - } + len_mask2 &= ((1ULL << (2*running_len)) - 1); + len_mask2 &= len_mask; // not necessary + uint64_t b_ = b & len_mask2; // The barcode alone in the location that it appears in + uint64_t b_shifted = b_ >> shift_len; // The barcode shifted into the least significant bits location + auto it = wbc[j].find(b_shifted); + if (it != wbc[j].end()) { // Barcode is in the whitelist + stat_white_++; + correction |= b_; + } else { + auto lower_mask = lower_upper_mask_vec[j].first; + auto upper_mask = lower_upper_mask_vec[j].second; + auto bc2 = bc2_vec[j]; + auto& correct = correct_vec[j]; + uint64_t lb = b_shifted & lower_mask; + uint64_t ub = (b_shifted >> (2 * bc2)) & upper_mask; + uint64_t lbc = 0, ubc = 0; + int correct_lower = search_for_mismatch(correct[ub].second, bc2, lb, lbc); + int correct_upper = search_for_mismatch(correct[lb].first, bclen2 - bc2, ub, ubc); + int nc = correct_lower + correct_upper; + if (nc != 1) { // Uncorrected + stat_uncorr_++; + break; + } else if (nc == 1) { + stat_corr_++; + if (correct_lower == 1) { + uint64_t b_corrected = (ub << (2 * bc2)) | lbc; + b_corrected = b_corrected << shift_len; // We have the corrected barcode in the correct location + b_corrected &= len_mask2; + correction |= b_corrected; // Add onto existing correction + } else if (correct_upper == 1) { + uint64_t b_corrected = (ubc << (2 * bc2)) | lb; + b_corrected = b_corrected << shift_len; // We have the corrected barcode in the correct location + b_corrected &= len_mask2; + correction |= b_corrected; // Add onto existing correction } - - bd.barcode = b_corrected; - bus_out.write((char *)&bd, sizeof(bd)); - stat_corr++; } - else if (correct_upper == 1) - { - uint64_t b_corrected = (ubc << (2 * bc2)) | lb; - if (dump_bool) - { - if (bd.barcode != old_barcode) - { - of << binaryToString(bd.barcode, bclen) << "\t" << binaryToString(b_corrected, bclen) << "\n"; - old_barcode = bd.barcode; - } - } - - bd.barcode = b_corrected; - bus_out.write((char *)&bd, sizeof(bd)); - stat_corr++; + } + } + if (stat_uncorr_ > 0) { + stat_uncorr++; // Uncorrected; do not write BUS record + } else if (stat_white_ == wbc.size()) { + stat_white++; + bus_out.write((char *)&bd, sizeof(bd)); // No correction; just write BUS record as-is + } else if (stat_corr_ > 0) { + stat_corr++; // Corrected; and write it out + bd.barcode = correction | (bd.barcode & ~len_mask); // Correction plus preserve the metadata bits outside barcode length + bus_out.write((char *)&bd, sizeof(bd)); + if (dump_bool) { + if (b != old_barcode) { + of << binaryToString(b, bclen) << "\t" << binaryToString(correction, bclen) << "\n"; + old_barcode = b; } } } @@ -606,20 +916,18 @@ void bustools_correct(Bustools_opt &opt) } std::cerr << "Processed " << nr << " BUS records" << std::endl - << "In whitelist = " << stat_white << std::endl + << "In on-list = " << stat_white << std::endl << "Corrected = " << stat_corr << std::endl << "Uncorrected = " << stat_uncorr << std::endl; - if (!opt.stream_out) - { + if (!opt.stream_out) { busf_out.close(); } - if (opt.dump_bool) - { + if (opt.dump_bool) { of.close(); // if of is open } delete[] p; p = nullptr; -} \ No newline at end of file +} diff --git a/src/bustools_count.cpp b/src/bustools_count.cpp index e5961e5..8ccb6a8 100644 --- a/src/bustools_count.cpp +++ b/src/bustools_count.cpp @@ -18,13 +18,18 @@ void bustools_count(Bustools_opt &opt) { // read and parse the equivalence class files - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + EcMapInv ecmapinv; std::vector> ecmap; - std::unordered_map txnames; + u_map_ txnames; + auto txnames_split = txnames; // copy + std::vector tx_split; // Store transcript names for split + std::vector tx_split_lookup; // Map transcript IDs to mtx status + bool count_split = !opt.count_split.empty(); + int count_mtx_priority = !opt.count_gene_multimapping && count_split ? 1 : 0; // 1 = when something in tx_split overlaps something not in tx_split, prioritize the latter (useful for dealing in cases when introns of one gene overlap exons of another gene [we prioritize the exons] parseTranscripts(opt.count_txp, txnames); std::vector genemap(txnames.size(), -1); - std::unordered_map genenames; + u_map_ genenames; parseGenes(opt.count_genes, txnames, genemap, genenames); parseECs(opt.count_ecs, h); ecmap = std::move(h.ecs); @@ -32,19 +37,23 @@ void bustools_count(Bustools_opt &opt) { for (int32_t ec = 0; ec < ecmap.size(); ec++) { ecmapinv.insert({ecmap[ec], ec}); } - std::vector> ec2genes; + std::vector> ec2genes, ec2genes_priority; create_ec2genes(ecmap, genemap, ec2genes); std::ofstream of; + std::ofstream of_2; + std::ofstream of_A; std::string mtx_ofn = opt.output + ".mtx"; + std::string mtx_ofn_split_2 = opt.output + ".2.mtx"; + std::string mtx_ofn_split_A = opt.output + ".ambiguous.mtx"; std::string barcodes_ofn = opt.output + ".barcodes.txt"; + std::string barcodes_prefix_ofn = opt.output + ".barcodes.prefix.txt"; std::string ec_ofn = opt.output + ".ec.txt"; std::string gene_ofn = opt.output + ".genes.txt"; std::string hist_ofn = opt.output + ".hist.txt"; std::string cu_per_cell_ofn = opt.output + ".CUPerCell.txt"; std::string cu_ofn = opt.output + ".cu.txt"; - of.open(mtx_ofn); // write out the initial header // keep the number of newlines constant, this way it will work for both Windows and Linux @@ -54,12 +63,46 @@ void bustools_count(Bustools_opt &opt) { ssHeader << headerComments; ssHeader << std::string(66, '%') << '\n'; size_t headerLength = ssHeader.str().length(); + // If we need to split matrix + if (count_split) { + parseTranscripts(opt.count_split, txnames_split); // subset of txnames + tx_split.reserve(txnames_split.size()); + tx_split_lookup.resize(txnames.size(), -1); + for (auto x : txnames_split) { + if (txnames.count(x.first)) tx_split.push_back(txnames[x.first]); + } + of_2.open(mtx_ofn_split_2); + of_A.open(mtx_ofn_split_A); + of_2 << ssHeader.str(); + of_A << ssHeader.str(); + auto ecmap_ = ecmap; // copy + for (int ec = 0; ec < ecmap.size(); ec++) { // Get new ecmap based on split + for (auto tx : ecmap[ec]) { + auto &new_ec = ecmap_[ec]; + bool found = std::find(tx_split.begin(), tx_split.end(), tx) != tx_split.end(); + tx_split_lookup[tx] = found; + // Remove transcripts depending on whether they're found in tx_split + // Note: It is possible for one of the new ECs to be empty, in which case intersect_genes_of_ecs will result in the empty set for glist + // Essentially, we are removing all tx's that belong to (or not belong to) tx_split in the equivalence classes + // This handles instances in which a read maps to exon of one gene but intron of another (likely overlapping) gene to avoid discarding the record + // This is done at read-level (not UMI-level) so if one UMI maps to one gene's exon but another UMI maps to the other gene's intron, we still discard it + if (count_mtx_priority == 1 && !found) + new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end()); + else if (count_mtx_priority == 2 && found) + new_ec.erase(std::remove(new_ec.begin(), new_ec.end(), tx), new_ec.end()); + } + } + if (count_mtx_priority != 0) + create_ec2genes(ecmap_, genemap, ec2genes_priority); // Note: Some ECs may not be associated with any genes (i.e. empty vector) + } + of.open(mtx_ofn); of << ssHeader.str(); - size_t n_cols = 0; size_t n_rows = 0; size_t n_entries = 0; + size_t n_entries_2 = 0; + size_t n_entries_A = 0; std::vector v; v.reserve(N); uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL; @@ -70,7 +113,7 @@ void bustools_count(Bustools_opt &opt) { std::vector u; u.reserve(100); std::vector column_v; - std::vector> column_vp; + std::vector>> column_vp; // gene, {count, matrix type} if (!opt.count_collapse) { column_v.reserve(N); } else { @@ -131,6 +174,7 @@ void bustools_count(Bustools_opt &opt) { if (opt.umi_gene_collapse) { intersect_genes_of_ecs(ecs,ec2genes, glist); + if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist); } if (opt.umi_gene_collapse && glist.size() == 0) { // Gene-intersection zero, check for UMI collision @@ -213,8 +257,11 @@ void bustools_count(Bustools_opt &opt) { } } double val = j-i; - of << n_rows << " " << (column_v[i]+1) << " " << val << "\n"; - n_entries++; + auto which_mtx = intersect_ecs_with_subset_txs(column_v[i], ecmap, tx_split_lookup); + auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A); + auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A); + of_ << n_rows << " " << (column_v[i]+1) << " " << val << "\n"; + n_entries_++; i = j; // increment } @@ -235,7 +282,7 @@ void bustools_count(Bustools_opt &opt) { std::vector> ambiguous_genes; - for (size_t i = 0; i < n; ) { + if (!opt.count_cm) for (size_t i = 0; i < n; ) { // Entire loop is for !opt.count_cm size_t j = i+1; for (; j < n; j++) { if (v[i].UMI != v[j].UMI) { @@ -252,6 +299,7 @@ void bustools_count(Bustools_opt &opt) { } intersect_genes_of_ecs(ecs,ec2genes, glist); + if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist); int gn = glist.size(); if (opt.count_downsampling_factor != 1.0) { uint32_t newCounts = 0; @@ -266,9 +314,10 @@ void bustools_count(Bustools_opt &opt) { } } if (gn > 0) { + auto which_mtx = intersect_ecs_with_subset_txs(ecs, ecmap, tx_split_lookup); if (opt.count_gene_multimapping) { for (auto x : glist) { - column_vp.push_back({x, (opt.count_raw_counts ? counts : 1.0)/gn}); + column_vp.push_back({x, {(opt.count_raw_counts ? counts : 1.0)/gn, which_mtx}}); } //Fill in histograms for prediction. if (opt.count_gen_hist) { @@ -284,7 +333,7 @@ void bustools_count(Bustools_opt &opt) { } } else { if (gn==1) { - column_vp.push_back({glist[0],opt.count_raw_counts ? counts : 1.0}); + column_vp.push_back({glist[0],{opt.count_raw_counts ? counts : 1.0, which_mtx}}); //Fill in histograms for prediction. if (opt.count_gen_hist) { if (glist[0] < n_genes) { //crasches with an invalid gene file otherwise @@ -327,16 +376,18 @@ void bustools_count(Bustools_opt &opt) { } if (glist.size() == 0) { intersect_genes_of_ecs(ecs_within_molecule, ec2genes, glist); + if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs_within_molecule, ec2genes_priority, glist); } gn = glist.size(); if (gn > 0) { + auto which_mtx = intersect_ecs_with_subset_txs(ecs_within_molecule, ecmap, tx_split_lookup); if (opt.count_gene_multimapping) { for (auto x : glist) { - column_vp.push_back({x, 1.0/gn}); + column_vp.push_back({x, {1.0/gn, which_mtx}}); } } else { if (gn==1) { - column_vp.push_back({glist[0],1.0}); + column_vp.push_back({glist[0],{1.0, which_mtx}}); } else if (opt.count_em) { ambiguous_genes.push_back(std::move(glist)); } @@ -345,32 +396,72 @@ void bustools_count(Bustools_opt &opt) { } } i = j; // increment + } else for (size_t i = 0; i < n; i++) { // Entire loop is for opt.count_cm + ecs.resize(0); + ecs.push_back(v[i].ec); + + intersect_genes_of_ecs(ecs, ec2genes, glist); + if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist); + int gn = glist.size(); + if (gn > 0) { + auto which_mtx = intersect_ecs_with_subset_txs(ecs, ecmap, tx_split_lookup); + if (opt.count_gene_multimapping) { + for (auto x : glist) { + column_vp.push_back({x, {v[i].count/gn, which_mtx}}); + } + } else { + if (gn==1) { + column_vp.push_back({glist[0],{v[i].count, which_mtx}}); + } + } + } } std::sort(column_vp.begin(), column_vp.end()); size_t m = column_vp.size(); - std::unordered_map col_map(m); + u_map_ col_map(m); + auto col_map_2 = col_map; // copy + auto col_map_A = col_map; // copy std::vector cols; for (size_t i = 0; i < m; ) { size_t j = i+1; - double val = column_vp[i].second; + double val = 0; + double val_2 = 0; + double val_A = 0; + auto mtx_type = column_vp[i].second.second; + if (mtx_type == COUNT_DEFAULT) val = column_vp[i].second.first; + else if (mtx_type == COUNT_SPLIT) val_2 = column_vp[i].second.first; + else val_A = column_vp[i].second.first; for (; j < m; j++) { if (column_vp[i].first != column_vp[j].first) { break; } - val += column_vp[j].second; + auto mtx_type = column_vp[j].second.second; + if (mtx_type == COUNT_DEFAULT) val += column_vp[j].second.first; + else if (mtx_type == COUNT_SPLIT) val_2 += column_vp[j].second.first; + else val_A += column_vp[j].second.first; + } + if (!count_split || val != 0) col_map.insert({column_vp[i].first,val}); + if (count_split) { + if (val_2 != 0) col_map_2.insert({column_vp[i].first,val_2}); + if (val_A != 0) col_map_A.insert({column_vp[i].first,val_A}); } - col_map.insert({column_vp[i].first,val}); cols.push_back(column_vp[i].first); - n_entries++; + if (count_split) { + if (val != 0) n_entries++; + if (val_2 != 0) n_entries_2++; + if (val_A != 0) n_entries_A++; + } else { + n_entries++; + } i = j; // increment } if (opt.count_em) { //std::cerr << "Running EM algorithm" << std::endl; - std::unordered_map c1,c2; + u_map_ c1,c2; // initialize with unique counts for (const auto &x : cols) { double val = 0; @@ -426,19 +517,77 @@ void bustools_count(Bustools_opt &opt) { } - - for (const auto &x : cols) { double val = 0; auto it = col_map.find(x); - if (it != col_map.end()) { - val = it->second; + if (!count_split) { + if (it != col_map.end()) val = it->second; + of << n_rows << " " << (x+1) << " " << val << "\n"; + } else { + if (it != col_map.end()) { + val = it->second; + of << n_rows << " " << (x+1) << " " << val << "\n"; + } + it = col_map_2.find(x); + if (it != col_map_2.end()) { + val = it->second; + of_2 << n_rows << " " << (x+1) << " " << val << "\n"; + } + it = col_map_A.find(x); + if (it != col_map_A.end()) { + val = it->second; + of_A << n_rows << " " << (x+1) << " " << val << "\n"; + } + } + } + }; + + auto write_barcode_matrix_mult = [&](const std::vector &v) { + if(v.empty()) { + return; + } + column_vp.resize(0); + n_rows+= 1; + + barcodes.push_back(v[0].barcode); + double val = 0.0; + size_t n = v.size(); + + for (size_t i = 0; i < n; i++) { + int32_t ec = v[i].ec; + if (!opt.count_gene_multimapping) { + ecs.resize(0); + ecs.push_back(ec); + intersect_genes_of_ecs(ecs, ec2genes, glist); + if (count_mtx_priority != 0 && glist.size() > 1) intersect_genes_of_ecs(ecs, ec2genes_priority, glist); + int gn = glist.size(); + if (gn != 1) { + continue; + } } - of << n_rows << " " << (x+1) << " " << val << "\n"; + column_vp.push_back({ec,{v[i].count,COUNT_DEFAULT}}); + } + std::sort(column_vp.begin(), column_vp.end()); + size_t m = column_vp.size(); + for (size_t i = 0; i < m; ) { + size_t j = i+1; + double val = column_vp[i].second.first; + for (; j < m; j++) { + if (column_vp[i].first != column_vp[j].first) { + break; + } + val += column_vp[j].second.first; + } + auto which_mtx = intersect_ecs_with_subset_txs(column_vp[i].first, ecmap, tx_split_lookup); + auto& of_ = which_mtx == COUNT_DEFAULT ? of : (which_mtx == COUNT_SPLIT ? of_2 : of_A); + auto& n_entries_ = which_mtx == COUNT_DEFAULT ? n_entries : (which_mtx == COUNT_SPLIT ? n_entries_2 : n_entries_A); + of_ << n_rows << " " << (column_vp[i].first+1) << " " << val << "\n"; + n_entries_++; + i = j; // increment } }; - for (const auto& infn : opt.files) { + for (const auto& infn : opt.files) { std::streambuf *inbuf; std::ifstream inf; if (!opt.stream_in) { @@ -467,9 +616,10 @@ void bustools_count(Bustools_opt &opt) { // output whatever is in v if (!v.empty()) { if (!opt.count_collapse) { - write_barcode_matrix(v); + if (!opt.count_cm) write_barcode_matrix(v); + else write_barcode_matrix_mult(v); } else { - write_barcode_matrix_collapsed(v); + write_barcode_matrix_collapsed(v); // Same signature for count_cm and !count_cm } } v.clear(); @@ -481,9 +631,10 @@ void bustools_count(Bustools_opt &opt) { } if (!v.empty()) { if (!opt.count_collapse) { - write_barcode_matrix(v); + if (!opt.count_cm) write_barcode_matrix(v); + else write_barcode_matrix_mult(v); } else { - write_barcode_matrix_collapsed(v); + write_barcode_matrix_collapsed(v); // Same signature for count_cm and !count_cm } } @@ -500,9 +651,13 @@ void bustools_count(Bustools_opt &opt) { } of.close(); + if (count_split) { + of_2.close(); + of_A.close(); + } //Rewrite header in a way that works for both Windows and Linux - std::stringstream ss; + std::stringstream ss, ss_2, ss_A; ss << n_rows << " " << n_cols << " " << n_entries; std::string header = ss.str(); int hlen = header.size(); @@ -510,6 +665,22 @@ void bustools_count(Bustools_opt &opt) { of.open(mtx_ofn, std::ios::in | std::ios::out); of << headerComments << header; of.close(); + if (count_split) { + ss_2 << n_rows << " " << n_cols << " " << n_entries_2; + header = ss_2.str(); + hlen = header.size(); + header = header + std::string(66 - hlen, ' ') + '\n'; + of_2.open(mtx_ofn_split_2, std::ios::in | std::ios::out); + of_2 << headerComments << header; + of_2.close(); + ss_A << n_rows << " " << n_cols << " " << n_entries_A; + header = ss_A.str(); + hlen = header.size(); + header = header + std::string(66 - hlen, ' ') + '\n'; + of_A.open(mtx_ofn_split_A, std::ios::in | std::ios::out); + of_A << headerComments << header; + of_A.close(); + } // write updated ec file h.ecs = std::move(ecmap); @@ -519,11 +690,22 @@ void bustools_count(Bustools_opt &opt) { writeGenes(gene_ofn, genenames); } // write barcode file + bool write_prefix = false; std::ofstream bcof; bcof.open(barcodes_ofn); + uint64_t len_mask = ((1ULL << (2*bclen)) - 1); for (const auto &x : barcodes) { + if (x != (x & len_mask)) write_prefix = true; bcof << binaryToString(x, bclen) << "\n"; } + if (write_prefix) { + std::ofstream bcprefixof; + bcprefixof.open(barcodes_prefix_ofn); + for (const auto &x : barcodes) { + bcprefixof << binaryToString(x >> (2*bclen), 16) << "\n"; // Always make prefix length 16 + } + bcprefixof.close(); + } bcof.close(); //write histogram file @@ -612,274 +794,5 @@ void bustools_count(Bustools_opt &opt) { } void bustools_count_mult(Bustools_opt &opt) { - BUSHeader h; - size_t nr = 0; - size_t N = 100000; - uint32_t bclen = 0; - BUSData* p = new BUSData[N]; - - // read and parse the equivalence class files - - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; - std::vector> ecmap; - - std::unordered_map txnames; - parseTranscripts(opt.count_txp, txnames); - std::vector genemap(txnames.size(), -1); - std::unordered_map genenames; - parseGenes(opt.count_genes, txnames, genemap, genenames); - parseECs(opt.count_ecs, h); - ecmap = std::move(h.ecs); - ecmapinv.reserve(ecmap.size()); - for (int32_t ec = 0; ec < ecmap.size(); ec++) { - ecmapinv.insert({ecmap[ec], ec}); - } - std::vector> ec2genes; - create_ec2genes(ecmap, genemap, ec2genes); - - - std::ofstream of; - std::string mtx_ofn = opt.output + ".mtx"; - std::string barcodes_ofn = opt.output + ".barcodes.txt"; - std::string ec_ofn = opt.output + ".ec.txt"; - std::string gene_ofn = opt.output + ".genes.txt"; - of.open(mtx_ofn); - - // write out the initial header - of << "%%MatrixMarket matrix coordinate real general\n%\n"; - // number of genes - auto mat_header_pos = of.tellp(); - std::string dummy_header(66, '\n'); - for (int i = 0; i < 33; i++) { - dummy_header[2*i] = '%'; - } - of.write(dummy_header.c_str(), dummy_header.size()); - - - size_t n_cols = 0; - size_t n_rows = 0; - size_t n_entries = 0; - std::vector v; - v.reserve(N); - uint64_t current_bc = 0xFFFFFFFFFFFFFFFFULL; - //temporary data - std::vector ecs; - std::vector glist; - ecs.reserve(100); - std::vector u; - u.reserve(100); - std::vector column_v; - std::vector> column_vp; - if (!opt.count_collapse) { - column_vp.reserve(N); - } else { - column_vp.reserve(N); - glist.reserve(100); - } - //barcodes - std::vector barcodes; - int bad_count = 0; - int compacted = 0; - int rescued = 0; - - - auto write_barcode_matrix = [&](const std::vector &v) { - if(v.empty()) { - return; - } - column_vp.resize(0); - n_rows+= 1; - - barcodes.push_back(v[0].barcode); - double val = 0.0; - size_t n = v.size(); - - for (size_t i = 0; i < n; i++) { - int32_t ec = v[i].ec; - if (!opt.count_gene_multimapping) { - ecs.resize(0); - ecs.push_back(ec); - intersect_genes_of_ecs(ecs, ec2genes, glist); - int gn = glist.size(); - if (gn != 1) { - continue; - } - } - column_vp.push_back({ec,v[i].count}); - } - std::sort(column_vp.begin(), column_vp.end()); - size_t m = column_vp.size(); - for (size_t i = 0; i < m; ) { - size_t j = i+1; - double val = column_vp[i].second; - for (; j < m; j++) { - if (column_vp[i].first != column_vp[j].first) { - break; - } - val += column_vp[j].second; - } - n_entries++; - of << n_rows << " " << (column_vp[i].first+1) << " " << val << "\n"; - i = j; // increment - } - }; - - auto write_barcode_matrix_collapsed = [&](const std::vector &v) { - if(v.empty()) { - return; - } - column_vp.resize(0); - n_rows+= 1; - - barcodes.push_back(v[0].barcode); - double val = 0.0; - size_t n = v.size(); - - for (size_t i = 0; i < n; i++) { - ecs.resize(0); - ecs.push_back(v[i].ec); - - intersect_genes_of_ecs(ecs, ec2genes, glist); - int gn = glist.size(); - if (gn > 0) { - if (opt.count_gene_multimapping) { - for (auto x : glist) { - column_vp.push_back({x, v[i].count/gn}); - } - } else { - if (gn==1) { - column_vp.push_back({glist[0],v[i].count}); - } - } - } - } - - std::sort(column_vp.begin(), column_vp.end()); - size_t m = column_vp.size(); - std::unordered_map col_map(m); - std::vector cols; - - for (size_t i = 0; i < m; ) { - size_t j = i+1; - double val = column_vp[i].second; - for (; j < m; j++) { - if (column_vp[i].first != column_vp[j].first) { - break; - } - val += column_vp[j].second; - } - col_map.insert({column_vp[i].first,val}); - cols.push_back(column_vp[i].first); - - n_entries++; - - i = j; // increment - } - - - - for (const auto &x : cols) { - double val = 0; - auto it = col_map.find(x); - if (it != col_map.end()) { - val = it->second; - } - of << n_rows << " " << (x+1) << " " << val << "\n"; - } - - }; - - for (const auto& infn : opt.files) { - std::streambuf *inbuf; - std::ifstream inf; - if (!opt.stream_in) { - inf.open(infn.c_str(), std::ios::binary); - inbuf = inf.rdbuf(); - } else { - inbuf = std::cin.rdbuf(); - } - std::istream in(inbuf); - - parseHeader(in, h); - bclen = h.bclen; - - int rc = 0; - while (true) { - in.read((char*)p, N*sizeof(BUSData)); - size_t rc = in.gcount() / sizeof(BUSData); - nr += rc; - if (rc == 0) { - break; - } - - - for (size_t i = 0; i < rc; i++) { - if (p[i].barcode != current_bc) { - // output whatever is in v - if (!v.empty()) { - if (!opt.count_collapse) { - write_barcode_matrix(v); - } else { - write_barcode_matrix_collapsed(v); - } - } - v.clear(); - current_bc = p[i].barcode; - } - v.push_back(p[i]); - - } - } - if (!v.empty()) { - if (!opt.count_collapse) { - write_barcode_matrix(v); - } else { - write_barcode_matrix_collapsed(v); - } - } - - if (!opt.stream_in) { - inf.close(); - } - } - delete[] p; p = nullptr; - - if (!opt.count_collapse) { - n_cols = ecmap.size(); - } else { - n_cols = genenames.size(); - } - - of.close(); - - std::stringstream ss; - ss << n_rows << " " << n_cols << " " << n_entries << "\n"; - std::string header = ss.str(); - int hlen = header.size(); - assert(hlen < 66); - of.open(mtx_ofn, std::ios::binary | std::ios::in | std::ios::out); - of.seekp(mat_header_pos); - of.write("%",1); - of.write(std::string(66-hlen-2,' ').c_str(),66-hlen-2); - of.write("\n",1); - of.write(header.c_str(), hlen); - of.close(); - - // write updated ec file - h.ecs = std::move(ecmap); - if (!opt.count_collapse) { - writeECs(ec_ofn, h); - } else { - writeGenes(gene_ofn, genenames); - } - // write barcode file - std::ofstream bcof; - bcof.open(barcodes_ofn); - for (const auto &x : barcodes) { - bcof << binaryToString(x, bclen) << "\n"; - } - bcof.close(); - //std::cerr << "bad counts = " << bad_count <<", rescued =" << rescued << ", compacted = " << compacted << std::endl; - - //std::cerr << "Read in " << nr << " BUS records" << std::endl; + bustools_count(opt); } diff --git a/src/bustools_main.cpp b/src/bustools_main.cpp index ab954a8..fe85131 100644 --- a/src/bustools_main.cpp +++ b/src/bustools_main.cpp @@ -328,7 +328,7 @@ void parse_ProgramOptions_capture(int argc, char **argv, Bustools_opt &opt) void parse_ProgramOptions_count(int argc, char **argv, Bustools_opt &opt) { - const char *opt_string = "o:g:e:t:md:"; + const char *opt_string = "o:g:e:t:md:s:"; int gene_flag = 0; int umigene_flag = 0; int em_flag = 0; @@ -348,6 +348,7 @@ void parse_ProgramOptions_count(int argc, char **argv, Bustools_opt &opt) {"hist", no_argument, &hist_flag, 1}, {"downsample", required_argument, 0, 'd'}, {"rawcounts", no_argument, &rawcounts_flag, 1}, + {"split", required_argument, 0, 's'}, {0, 0, 0, 0}}; int option_index = 0, c; @@ -375,24 +376,23 @@ void parse_ProgramOptions_count(int argc, char **argv, Bustools_opt &opt) case 'm': opt.count_gene_multimapping = true; break; + case 's': + opt.count_split = optarg; + break; default: break; } } - if (gene_flag) - { + if (gene_flag) { opt.count_collapse = true; } - if (umigene_flag) - { + if (umigene_flag) { opt.umi_gene_collapse = true; } - if (em_flag) - { + if (em_flag) { opt.count_em = true; } - if (cm_flag) - { + if (cm_flag) { opt.count_cm = true; } if (hist_flag) { @@ -493,13 +493,14 @@ void parse_ProgramOptions_predict(int argc, char **argv, Bustools_opt& opt) { void parse_ProgramOptions_dump(int argc, char **argv, Bustools_opt &opt) { - const char *opt_string = "o:pfd"; + const char *opt_string = "o:pfda"; static struct option long_options[] = { {"output", required_argument, 0, 'o'}, {"pipe", no_argument, 0, 'p'}, {"flags", no_argument, 0, 'f'}, {"pad", no_argument, 0, 'd'}, + {"showAll", no_argument, 0, 'a'}, {0, 0, 0, 0}}; int option_index = 0, c; @@ -521,6 +522,9 @@ void parse_ProgramOptions_dump(int argc, char **argv, Bustools_opt &opt) case 'd': opt.text_dumppad = true; break; + case 'a': + opt.text_showall = true; + break; default: break; } @@ -576,13 +580,14 @@ void parse_ProgramOptions_fromtext(int argc, char **argv, Bustools_opt& opt) { void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt) { - const char *opt_string = "o:w:d:sp"; + const char *opt_string = "o:w:d:spr"; static struct option long_options[] = { {"output", required_argument, 0, 'o'}, {"whitelist", required_argument, 0, 'w'}, {"dump", required_argument, 0, 'd'}, {"split", no_argument, 0, 's'}, {"pipe", no_argument, 0, 'p'}, + {"replace", no_argument, 0, 'r'}, {0, 0, 0, 0}}; int option_index = 0, c; @@ -608,6 +613,9 @@ void parse_ProgramOptions_correct(int argc, char **argv, Bustools_opt &opt) case 'p': opt.stream_out = true; break; + case 'r': + opt.barcode_replacement = true; + break; default: break; } @@ -1604,7 +1612,7 @@ bool check_ProgramOptions_correct(Bustools_opt &opt) if (opt.whitelist.size() == 0) { - std::cerr << "Error: Missing whitelist file" << std::endl; + std::cerr << "Error: Missing on-list file" << std::endl; ret = false; } else @@ -1632,95 +1640,75 @@ bool check_ProgramOptions_count(Bustools_opt &opt) bool ret = true; // check for output directory - if (opt.output.empty()) - { + if (opt.output.empty()) { std::cerr << "Error: Missing output directory" << std::endl; ret = false; } - else - { + else { bool isDir = false; - if (checkDirectoryExists(opt.output)) - { + if (checkDirectoryExists(opt.output)) { isDir = true; } - else - { - if (opt.output.at(opt.output.size() - 1) == '/') - { - if (my_mkdir(opt.output.c_str(), 0777) == -1) - { + else { + if (opt.output.at(opt.output.size() - 1) == '/') { + if (my_mkdir(opt.output.c_str(), 0777) == -1) { std::cerr << "Error: could not create directory " << opt.output << std::endl; ret = false; } - else - { + else { isDir = true; } } } - if (isDir) - { + if (isDir) { opt.output += "output"; } } - if (opt.count_em && opt.count_gene_multimapping) - { + if (opt.count_em && opt.count_gene_multimapping) { std::cerr << "Error: EM algorithm and counting multimapping reads are incompatible" << std::endl; ret = false; } - if (opt.count_em && opt.count_cm) - { + if (opt.count_em && opt.count_cm) { std::cerr << "Error: EM algorithm and counting multiplicites are incompatible" << std::endl; ret = false; } - if (opt.umi_gene_collapse && opt.count_cm) - { + if (opt.umi_gene_collapse && opt.count_cm) { std::cerr << "Error: Gene-level collapsing of UMIs and counting multiplicites are incompatible" << std::endl; ret = false; } - if (opt.umi_gene_collapse && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0)) - { + if (opt.umi_gene_collapse && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0)) { std::cerr << "Error: Gene-level collapsing of UMIs is currently incompatible with --hist, --downsample, or --rawcounts" << std::endl; ret = false; } - if (opt.count_cm && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0)) - { + if (opt.count_cm && (opt.count_raw_counts || opt.count_gen_hist || opt.count_downsampling_factor != 1.0)) { std::cerr << "Error: Counting multiplicites is incompatible with --hist, --downsample, or --rawcounts" << std::endl; ret = false; } - if (opt.count_raw_counts && opt.count_em) - { + if (opt.count_raw_counts && opt.count_em) { std::cerr << "Error: Counting raw counts are not supported for the EM algorithm" << std::endl; ret = false; } - if (opt.count_raw_counts && !opt.count_collapse) - { + if (opt.count_raw_counts && !opt.count_collapse) { std::cerr << "Error: Raw counts are currently only supported for gene counting, not ec counting." << std::endl; ret = false; } - if (opt.files.size() == 0) - { + if (opt.files.size() == 0) { std::cerr << "Error: Missing BUS input files" << std::endl; ret = false; } - else - { - if (!opt.stream_in) - { - for (const auto &it : opt.files) - { - if (!checkFileExists(it)) - { + else { + if (!opt.stream_in) { + for (const auto &it : opt.files) { + if (!checkFileExists(it)) { std::cerr << "Error: File not found, " << it << std::endl; ret = false; } @@ -1728,13 +1716,11 @@ bool check_ProgramOptions_count(Bustools_opt &opt) } } - if (opt.count_genes.size() == 0) - { + if (opt.count_genes.size() == 0) { std::cerr << "Error: missing gene mapping file" << std::endl; ret = false; } - else - { + else { if (!checkFileExists(opt.count_genes)) { std::cerr << "Error: File not found " << opt.count_genes << std::endl; @@ -1742,13 +1728,11 @@ bool check_ProgramOptions_count(Bustools_opt &opt) } } - if (opt.count_ecs.size() == 0) - { + if (opt.count_ecs.size() == 0) { std::cerr << "Error: missing equivalence class mapping file" << std::endl; ret = false; } - else - { + else { if (!checkFileExists(opt.count_ecs)) { std::cerr << "Error: File not found " << opt.count_ecs << std::endl; @@ -1756,20 +1740,28 @@ bool check_ProgramOptions_count(Bustools_opt &opt) } } - if (opt.count_txp.size() == 0) - { + if (opt.count_txp.size() == 0) { std::cerr << "Error: missing transcript name file" << std::endl; ret = false; } - else - { - if (!checkFileExists(opt.count_txp)) - { + else { + if (!checkFileExists(opt.count_txp)) { std::cerr << "Error: File not found " << opt.count_txp << std::endl; ret = false; } } + if (opt.count_split.size() != 0) { + if (!checkFileExists(opt.count_split)) { + std::cerr << "Error: File not found " << opt.count_split << std::endl; + ret = false; + } + if (opt.count_em) { + std::cerr << "Cannot use -s with --em" << std::endl; + ret = false; + } + } + return ret; } @@ -2619,6 +2611,7 @@ void Bustools_dump_Usage() << "-f, --flags Write the flag column" << std::endl << "-d, --pad Write the pad column" << std::endl << "-p, --pipe Write to standard output" << std::endl + << "-a, --showAll Show hidden metadata in barcodes" << std::endl << std::endl; } @@ -2637,10 +2630,10 @@ void Bustools_correct_Usage() << std::endl << "Options: " << std::endl << "-o, --output File for corrected bus output" << std::endl - << "-w, --whitelist File of whitelisted barcodes to correct to" << std::endl + << "-w, --whitelist File of on-list barcodes to correct to" << std::endl << "-p, --pipe Write to standard output" << std::endl << "-d, --dump Dump uncorrected to corrected barcodes (optional)" << std::endl - << "-s, --split Split the whitelist and correct each half independently (optional)" << std::endl + << "-r, --replace The file of on-list barcodes is a barcode replacement file" << std::endl << std::endl; } @@ -2657,6 +2650,7 @@ void Bustools_count_Usage() << " --umi-gene Perform gene-level collapsing of UMIs" << std::endl << " --em Estimate gene abundances using EM algorithm" << std::endl << " --cm Count multiplicites instead of UMIs" << std::endl + << "-s, --split Split output matrix in two (plus ambiguous) based on transcripts supplied in this file" << std::endl << "-m, --multimapping Include bus records that pseudoalign to multiple genes" << std::endl << " --hist Output copy per UMI histograms for all genes" << std::endl << "-d --downsample Specify a factor between 0 and 1 specifying how much to downsample" << std::endl diff --git a/src/bustools_mash.cpp b/src/bustools_mash.cpp index d9c3d17..1df9702 100644 --- a/src/bustools_mash.cpp +++ b/src/bustools_mash.cpp @@ -12,7 +12,7 @@ #include "bustools_merge.h" -inline std::vector get_tids(const BUSHeader &oh, const std::unordered_map, int32_t, SortedVectorHasher> &ecmapinv, const int32_t &eid) +inline std::vector get_tids(const BUSHeader &oh, const u_map_, int32_t, SortedVectorHasher> &ecmapinv, const int32_t &eid) { std::vector tids = oh.ecs[eid]; @@ -56,7 +56,7 @@ void bustools_mash(const Bustools_opt &opt) std::cerr << "[info] parsed output.bus files" << std::endl; // parse the transcripts.txt - std::unordered_map txn_tid; + u_map_ txn_tid; std::vector> tids_per_file; // list of tids as they occur for each file std::vector tids; // a vector of tids int32_t tid = 0; @@ -101,12 +101,14 @@ void bustools_mash(const Bustools_opt &opt) oh.bclen = vh[0].bclen; oh.umilen = vh[0].umilen; - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; // set{tids} (ec) to eid it came from + u_map_, int32_t, SortedVectorHasher> ecmapinv; // set{tids} (ec) to eid it came from for (int32_t i = 0; i < tid; i++) { - oh.ecs.push_back({i}); - ecmapinv.insert({{i}, i}); + std::vector tmp_vec; + tmp_vec.push_back(i); + oh.ecs.push_back(tmp_vec); + ecmapinv.insert({tmp_vec, i}); } std::vector> eids_per_file; diff --git a/src/bustools_merge.cpp b/src/bustools_merge.cpp index f330efe..c89eae3 100644 --- a/src/bustools_merge.cpp +++ b/src/bustools_merge.cpp @@ -57,7 +57,7 @@ void bustools_merge_different_index(const Bustools_opt &opt) std::ifstream ifn(opt.count_txp); std::string txn; int32_t tid; - std::unordered_map txn_tid; + u_map_ txn_tid; std::vector tids; // insert tids into a vector @@ -81,7 +81,7 @@ void bustools_merge_different_index(const Bustools_opt &opt) BUSHeader h, bh; parseECs(opt.count_ecs, h); // put the ecs into a ecmap inv - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + u_map_, int32_t, SortedVectorHasher> ecmapinv; for (std::size_t ec = 0; ec < h.ecs.size(); ec++) { diff --git a/src/bustools_project.cpp b/src/bustools_project.cpp index c5b1979..6d1e195 100644 --- a/src/bustools_project.cpp +++ b/src/bustools_project.cpp @@ -20,7 +20,7 @@ void bustools_project(Bustools_opt &opt) { size_t stat_map = 0; size_t stat_unmap = 0; - std::unordered_map project_map; + u_map_ project_map; /* Load the map into project_map variable parse bus records and map each object (barcode, umi) with project_map @@ -175,18 +175,18 @@ void bustools_project(Bustools_opt &opt) { } if (opt.type == PROJECT_TX) { std::ofstream of; - std::unordered_map txnames; + u_map_ txnames; parseTranscripts(opt.count_txp, txnames); std::vector genemap(txnames.size(), -1); - std::unordered_map genenames; + u_map_ genenames; parseGenes(opt.map, txnames, genemap, genenames); std::vector genenamesinv(genenames.size(), ""); for (const auto &gene : genenames) { genenamesinv[gene.second] = gene.first; } - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + u_map_, int32_t, SortedVectorHasher> ecmapinv; std::vector> ecmap; parseECs(opt.count_ecs, h); ecmap = std::move(h.ecs); @@ -199,7 +199,7 @@ void bustools_project(Bustools_opt &opt) { create_ec2genes(ecmap, genemap, ec2genes); std::vector> geneEc2genes = ec2genes; - std::unordered_map, int32_t, SortedVectorHasher> geneEc2genesinv; + u_map_, int32_t, SortedVectorHasher> geneEc2genesinv; std::sort(geneEc2genes.begin(), geneEc2genes.end()); auto firstNonempty = geneEc2genes.begin(); while (firstNonempty->size() == 0 && firstNonempty != geneEc2genes.end()) { @@ -284,7 +284,7 @@ void bustools_project(Bustools_opt &opt) { BUSData *p = new BUSData[N]; BUSData currRec; // Gene EC --> counts for current barcode/UMI pair - std::unordered_map counts; + u_map_ counts; while (true) { in.read((char*) p, N * sizeof(BUSData)); diff --git a/src/bustools_sort.cpp b/src/bustools_sort.cpp index 0f2323c..adc5e7e 100644 --- a/src/bustools_sort.cpp +++ b/src/bustools_sort.cpp @@ -4,11 +4,14 @@ #include #include #include +#include #include "Common.hpp" #include "BUSData.h" #include "bustools_sort.h" +#include + #define TP std::pair //This code is for automatically creating the tmp directory supplied if it doesn't exist @@ -16,9 +19,61 @@ //#include //once filesystem is acceptable for minGW, switch to that #include "windows.h" //Needed for CreateDirectory + + void EnsureWindowsTempDirectoryExists(const Bustools_opt &opt) { + //Make sure to create the tmp directory if it doesn't exist - writing temporary files fails otherwise in Windows + //First get the directory - in theory, opt.temp_files can look like "tmp/x_" or just "x_" (or even nothing) + //so we should find the last slash and make sure that directory exists + + std::size_t ind = opt.temp_files.rfind('/'); + std::size_t ind2 = opt.temp_files.rfind('\\'); + if (ind == std::string::npos) + { + ind = ind2; + } + else if (ind2 != std::string::npos) + { + //both valid, take the largest value (representing the last slash) + ind = std::max(ind, ind2); + } + if (ind != std::string::npos) + { + auto dirName = opt.temp_files.substr(0, ind); + //When our MinGW builds support c++17, change to std::filesystem + + //std::filesystem::path filepath = dirName; + //if (!std::filesystem::is_directory(filepath)) + //{ + // std::filesystem::create_directory(filepath); + //} + CreateDirectory(dirName.c_str(), NULL); //This will do nothing if the directory exists already + } + } + + //There is a bug in Windows, where bustools sort fails. The problem is that + //gcount for some reason fails here if too much is read and returns 0, even though + //it succeeds. Could perhaps be a 32 bit issue somewhere, does size_t become 32 bits? + //Anyway, this is a workaround that fixes the issue - does the same as the flag -m 100000000. + //An interesting observation is that opt.max_memory is set to 1 << 32, which will become exactly + //zero if truncated to 32 bits... + size_t WindowsMaxMemory(size_t mem) { + + const size_t win_mem_max = 1e8; + if (mem > win_mem_max) { + mem = win_mem_max; + } + return mem; + } +#else + void EnsureWindowsTempDirectoryExists(const Bustools_opt &opt) {} + size_t WindowsMaxMemory(size_t mem) {return mem;} + #endif + + + inline bool cmp1(const BUSData &a, const BUSData &b) { if (a.barcode == b.barcode) @@ -328,22 +383,132 @@ inline bool ncmp5(const TP &a, const TP &b) } }; + +void sort_bus_array(BUSData* busdata, size_t N, const int t, bool (*cmp)(const BUSData &, const BUSData &)) { + //std::sort(busdata, busdata + N, cmp); + if (t > 1 && N > 100000) { + const size_t s = 256; + std::vector samples, pivots; + + // samples = drawn from 0, s, 2s, ... , t*s + samples.reserve(s*t); + for (int i = 0; i < s*t; ++i) { + samples.push_back(busdata[i * (N / (s*t))]); + } + std::sort(samples.begin(), samples.end(), cmp); + + pivots.reserve(t-1); + // piviots are samples s, 2s, ... , (t-1)*s + for (int i = 1; i < t; ++i) { + pivots.push_back(samples[i * s]); + + //std::cerr << "pivot " << i << " = " << binaryToString(pivots[i-1].barcode, 16) << std::endl; + } + + // buckets are locations of pivots after partitioning + // partition i is between buckets[i] and buckets[i+1] + std::vector buckets(t+1, 0); + buckets[0] = 0; + buckets[t] = N; + + + double partition_time = 0; + clock_t start, end; + start = clock(); + /* + for (int i = 0; i < t-1; i++) { + BUSData p = pivots[i]; + //std::cerr << "partitioning around " << binaryToString(p.barcode, 16) << std::endl; + auto mid = std::partition(busdata + buckets[i], busdata + N, [&p, &cmp](const BUSData &a) { return cmp(a, p); }) - busdata; + buckets[i+1] = mid; + //std::cerr << "bucket " << i << " has " << buckets[i+1] - buckets[i] << " elements, mid = " << mid << std::endl; + } + //std::cerr << "bucket " << t-1 << " has " << buckets[t] - buckets[t-1] << " elements" << std::endl; + */ + + // partition the busdata based on the middle pivot + std::function mid_partition = [&](int i, int j) { + if (j-i <= 1) { + return; + } + size_t k = i + (j-i)/2; + //std::cerr << "partitioning " << i << " to " << j << " with middle " << k << std::endl; + //std::cerr << "buckets i and j are " << buckets[i] << " and " << buckets[j] << std::endl; + BUSData p = pivots[k-1]; + //std::cerr << "pivot element is " << binaryToString(p.barcode, 16) << std::endl; + buckets[k] = std::partition(busdata + buckets[i], busdata + buckets[j], [&p, &cmp](const BUSData &a) { return cmp(a, p); }) - busdata; + //std::cerr << "bucket " << k << " is " << buckets[k] << std::endl; + mid_partition(i, k); + mid_partition(k, j); + }; + + mid_partition(0, t); + + + /* + // verify that the pivots are sorted + for (int i = 0; i < t-2; i++) { + if (!cmp(pivots[i], pivots[i+1])) { + std::cerr << "pivot " << i << " is not smaller than pivot " << i+1 << std::endl; + exit(1); + } + } + + for (int i = 0; i < t; i++) { + std::cerr << "bucket " << i << " at " << buckets[i] << " has " << buckets[i+1] - buckets[i] << " elements" << std::endl; + } + + //verify that each partition is smaller than the pivot + for (int i = 0; i < t; i++) { + for (size_t j = buckets[i]; j < buckets[i+1]; j++) { + if (i < t-1 && !cmp(busdata[j], pivots[i])) { + std::cerr << "partition " << i << " has an element larger than the pivot" << std::endl; + std::cerr << "element " << j << " = " << binaryToString(busdata[j].barcode, 16) << std::endl; + std::cerr << "pivot " << i << " = " << binaryToString(pivots[i].barcode, 16) << std::endl; + + exit(1); + } + if (i > 0 && cmp(busdata[j], pivots[i-1])) { + std::cerr << "partition " << i << " has an element smaller than the next pivot" << std::endl; + std::cerr << "element " << j << " = " << binaryToString(busdata[j].barcode, 16) << std::endl; + std::cerr << "pivot " << i-1 << " = " << binaryToString(pivots[i-1].barcode, 16) << std::endl; + exit(1); + } + } + } + */ + + + end = clock(); + partition_time += ((double) (end - start)) / CLOCKS_PER_SEC; + std::cerr << "partition time: " << partition_time << "s" << std::endl; + + + + // sort each bucket + std::vector workers; + for (int i = 0; i < t; ++i) { + workers.push_back(std::thread([&busdata, &buckets, &cmp, i]() { + //std::cerr << "sorting bucket " << i << " with " << buckets[i] << " to " << buckets[i+1]<< std::endl; + std::sort(busdata + buckets[i], busdata + buckets[i+1], cmp); + })); + } + + for (auto &w : workers) { + w.join(); + } + + + } else { + std::sort(busdata, busdata + N, cmp); + } + +} + void bustools_sort(const Bustools_opt &opt) { - auto mem = opt.max_memory; - //There is a bug in Windows, where bustools sort fails. The problem is that - //gcount for some reason fails here if too much is read and returns 0, even though - //it succeeds. Could perhaps be a 32 bit issue somewhere, does size_t become 32 bits? - //Anyway, this is a workaround that fixes the issue - does the same as the flag -m 100000000. - //An interesting observation is that opt.max_memory is set to 1 << 32, which will become exactly - //zero if truncated to 32 bits... -#if defined(__MINGW32__) || defined(_MSC_VER) - const size_t win_mem_max = 1e8; - if (mem > win_mem_max) - { - mem = win_mem_max; - } -#endif + auto mem = WindowsMaxMemory(opt.max_memory); + BUSHeader h; size_t N = mem / sizeof(BUSData); BUSData *p = new BUSData[N]; @@ -381,49 +546,77 @@ void bustools_sort(const Bustools_opt &opt) exit(1); } -#if defined(__MINGW32__) || defined(_MSC_VER) - //Make sure to create the tmp directory if it doesn't exist - writing temporary files fails otherwise in Windows - //First get the directory - in theory, opt.temp_files can look like "tmp/x_" or just "x_" (or even nothing) - //so we should find the last slash and make sure that directory exists - std::size_t ind = opt.temp_files.rfind('/'); - std::size_t ind2 = opt.temp_files.rfind('\\'); - if (ind == std::string::npos) - { - ind = ind2; - } - else if (ind2 != std::string::npos) - { - //both valid, take the largest value (representing the last slash) - ind = std::max(ind, ind2); - } - if (ind != std::string::npos) - { - auto dirName = opt.temp_files.substr(0, ind); - //When our MinGW builds support c++17, change to std::filesystem - - //std::filesystem::path filepath = dirName; - //if (!std::filesystem::is_directory(filepath)) - //{ - // std::filesystem::create_directory(filepath); - //} - CreateDirectory(dirName.c_str(), NULL); //This will do nothing if the directory exists already - } -#endif + - size_t sc = 0; + size_t sc = 0; // number of records read + double sorting_time = 0; int tmp_file_no = 0; - for (const auto &infn : opt.files) - { + + // only use a single buffer if we are reading from stdin or if we have a single file + bool all_in_buffer = opt.stream_in || opt.files.size() == 1; + + + const auto collapse_and_write = [&](BUSData *p, size_t rc, std::ostream &outf) { + size_t batch = 1<<20; + std::vector v; + v.reserve(batch); + + for (size_t i = 0; i < rc;) { + size_t j = i + 1; + uint32_t c = p[i].count; + auto ec = p[i].ec; + for (; j < rc; j++) { + if (p[i].barcode == p[j].barcode && p[i].UMI == p[j].UMI && p[i].ec == p[j].ec && p[i].flags == p[j].flags && p[i].pad == p[j].pad) { + c += p[j].count; + } else { + break; + } + } + // merge identical things + p[i].count = c; + + // push back p to the vector + v.push_back(p[i]); + + if (v.size() >= batch) { + outf.write((char *)v.data(), v.size() * sizeof(BUSData)); + v.clear(); + } + + //outf.write((char *)(&(p[i])), sizeof(p[i])); + // increment + i = j; + } + if (v.size() > 0) { + outf.write((char *)v.data(), v.size() * sizeof(BUSData)); + v.clear(); + } + }; + + // open the correct output stream + std::ofstream of; + std::streambuf *buf = nullptr; + if (!opt.stream_out) { + of.open(opt.output, std::ios::out | std::ios::binary); + buf = of.rdbuf(); + } else { + buf = std::cout.rdbuf(); + } + std::ostream busf_out(buf); + + // measure time spent reading input + clock_t start,end; + double reading_time = 0; + double writing_time = 0; + + for (const auto &infn : opt.files) { std::streambuf *inbuf; std::ifstream inf; - if (!opt.stream_in) - { + if (!opt.stream_in) { inf.open(infn.c_str(), std::ios::binary); inbuf = inf.rdbuf(); - } - else - { + } else { inbuf = std::cin.rdbuf(); } std::istream in(inbuf); @@ -432,163 +625,163 @@ void bustools_sort(const Bustools_opt &opt) int rc = 1; - while (in.good()) - { - // read as much as we can + + while (in.good()) { + + start = clock(); in.read((char *)p, N * sizeof(BUSData)); size_t rc = in.gcount() / sizeof(BUSData); - if (rc == 0) - { + end = clock(); + reading_time += ((double) (end - start)) / CLOCKS_PER_SEC; + + // no records read, we are done + if (rc == 0) { break; } + + // records did not fit in buffer + if (rc >= N) { + all_in_buffer = false; + } + // now sort the data - std::sort(p, p + rc, cmp); - sc += rc; + start = clock(); + //std::sort(p, p + rc, cmp); + sort_bus_array(p, rc, opt.threads, cmp); + end = clock(); + sorting_time += ((double) (end - start)) / CLOCKS_PER_SEC; - // write the output - std::ofstream outf(opt.temp_files + std::to_string(tmp_file_no), std::ios::binary); - writeHeader(outf, h); + sc += rc; - for (size_t i = 0; i < rc;) - { - size_t j = i + 1; - uint32_t c = p[i].count; - auto ec = p[i].ec; - for (; j < rc; j++) - { - if (p[i].barcode == p[j].barcode && p[i].UMI == p[j].UMI && p[i].ec == p[j].ec && p[i].flags == p[j].flags && p[i].pad == p[j].pad) - { - c += p[j].count; - } - else - { - break; - } - } - // merge identical things - p[i].count = c; - outf.write((char *)(&(p[i])), sizeof(p[i])); - // increment - i = j; + if (all_in_buffer) { + std::cerr << " all fits in buffer" << std::endl; + // single file or stream, all data fits in buffer, write directly to output + start = clock(); + writeHeader(busf_out, h); + collapse_and_write(p, rc, busf_out); + end = clock(); + writing_time = ((double) (end - start)) / CLOCKS_PER_SEC; + } else { + // need to sort in chunks + // write the output + std::ofstream outf(opt.temp_files + std::to_string(tmp_file_no), std::ios::binary); + writeHeader(outf, h); + + collapse_and_write(p, rc, outf); + + outf.close(); + tmp_file_no++; } - - outf.close(); - tmp_file_no++; + } } delete[] p; p = nullptr; std::cerr << "Read in " << sc << " BUS records" << std::endl; + + std::cerr << "reading time " << reading_time << "s" << std::endl; + std::cerr << "sorting time " << sorting_time << "s" << std::endl; + std::cerr << "writing time " << writing_time << "s" << std::endl; - std::streambuf *buf = nullptr; - std::ofstream of; - if (!opt.stream_out) - { - of.open(opt.output, std::ios::out | std::ios::binary); - buf = of.rdbuf(); - } - else - { - buf = std::cout.rdbuf(); - } - std::ostream busf_out(buf); - writeHeader(busf_out, h); + + if (!all_in_buffer) { + writeHeader(busf_out, h); - // todo: skip writing to disk if it fits in memory - if (tmp_file_no == 1) - { - size_t M = N / 8; - p = new BUSData[M]; - std::ifstream in(opt.temp_files + "0", std::ios::binary); - BUSHeader tmp; - parseHeader(in, tmp); - while (in.good()) + if (tmp_file_no == 1) { - // read as much as we can - in.read((char *)p, M * sizeof(BUSData)); - size_t rc = in.gcount() / sizeof(BUSData); - if (rc == 0) + size_t M = N / 8; + p = new BUSData[M]; + std::ifstream in(opt.temp_files + "0", std::ios::binary); + BUSHeader tmp; + parseHeader(in, tmp); + while (in.good()) { - break; + // read as much as we can + in.read((char *)p, M * sizeof(BUSData)); + size_t rc = in.gcount() / sizeof(BUSData); + if (rc == 0) + { + break; + } + busf_out.write((char *)p, rc * sizeof(BUSData)); } - busf_out.write((char *)p, rc * sizeof(BUSData)); - } - in.close(); - std::remove((opt.temp_files + "0").c_str()); - } - else - { - // TODO: test if replacing with k-way merge is better - // adapted from https://github.com/arq5x/kway-mergesort/blob/master/kwaymergesort.h - int k = tmp_file_no; - size_t M = N / (k); - //std::memset(p, 0, N*sizeof(BUSData)); - std::vector bf(k); - for (int i = 0; i < k; i++) - { - bf[i].open((opt.temp_files + std::to_string(i)).c_str(), std::ios::binary); - BUSHeader tmp; - parseHeader(bf[i], tmp); + in.close(); + std::remove((opt.temp_files + "0").c_str()); } - - std::priority_queue, std::function> pq(ncmp); - BUSData t; - for (int i = 0; i < k; i++) - { - bf[i].read((char *)&t, sizeof(t)); - pq.push({t, i}); - } - - BUSData curr = pq.top().first; - curr.count = 0; // we'll count this again in the first loop - while (!pq.empty()) + else { - TP min = pq.top(); + // TODO: test if replacing with k-way merge is better + // adapted from https://github.com/arq5x/kway-mergesort/blob/master/kwaymergesort.h + int k = tmp_file_no; + size_t M = N / (k); + //std::memset(p, 0, N*sizeof(BUSData)); + std::vector bf(k); + for (int i = 0; i < k; i++) + { + bf[i].open((opt.temp_files + std::to_string(i)).c_str(), std::ios::binary); + BUSHeader tmp; + parseHeader(bf[i], tmp); + } - pq.pop(); - // process the data - BUSData &m = min.first; - int i = min.second; - if (m.barcode == curr.barcode && m.UMI == curr.UMI && m.ec == curr.ec && m.flags == curr.flags && m.pad == curr.pad) + std::priority_queue, std::function> pq(ncmp); + BUSData t; + for (int i = 0; i < k; i++) { - // same data, increase count - curr.count += m.count; + bf[i].read((char *)&t, sizeof(t)); + pq.push({t, i}); } - else + + BUSData curr = pq.top().first; + curr.count = 0; // we'll count this again in the first loop + while (!pq.empty()) { + TP min = pq.top(); - // new data let's output curr, new curr is m - if (curr.count != 0) + pq.pop(); + // process the data + BUSData &m = min.first; + int i = min.second; + if (m.barcode == curr.barcode && m.UMI == curr.UMI && m.ec == curr.ec && m.flags == curr.flags && m.pad == curr.pad) { - busf_out.write((char *)&curr, sizeof(curr)); + // same data, increase count + curr.count += m.count; } - curr = m; - } - // read next from stream - if (bf[i].good()) - { - bf[i].read((char *)&t, sizeof(t)); - if (bf[i].gcount() > 0) + else + { + + // new data let's output curr, new curr is m + if (curr.count != 0) + { + busf_out.write((char *)&curr, sizeof(curr)); + } + curr = m; + } + // read next from stream + if (bf[i].good()) { - pq.push({t, i}); + bf[i].read((char *)&t, sizeof(t)); + if (bf[i].gcount() > 0) + { + pq.push({t, i}); + } } } - } - if (curr.count > 0) - { - // write out remaining straggler - busf_out.write((char *)&curr, sizeof(curr)); - } + if (curr.count > 0) + { + // write out remaining straggler + busf_out.write((char *)&curr, sizeof(curr)); + } - // remove intermediary files - for (int i = 0; i < k; i++) - { - bf[i].close(); - std::remove((opt.temp_files + std::to_string(i)).c_str()); + // remove intermediary files + for (int i = 0; i < k; i++) + { + bf[i].close(); + std::remove((opt.temp_files + std::to_string(i)).c_str()); + } } } diff --git a/src/bustools_text.cpp b/src/bustools_text.cpp index 2c86d3e..9eff0dc 100644 --- a/src/bustools_text.cpp +++ b/src/bustools_text.cpp @@ -46,6 +46,9 @@ void bustools_text(const Bustools_opt& opt) { parseHeader(in, h); uint32_t bclen = h.bclen; uint32_t umilen = h.umilen; + if (opt.text_showall) { + bclen = 32; + } int rc = 0; while (true) { in.read((char*)p, N * sizeof(BUSData)); diff --git a/src/bustools_umicorrect.cpp b/src/bustools_umicorrect.cpp index 6efdadb..a6facbb 100644 --- a/src/bustools_umicorrect.cpp +++ b/src/bustools_umicorrect.cpp @@ -231,13 +231,13 @@ void bustools_umicorrect(const Bustools_opt& opt) { // read and parse the equivelence class files - std::unordered_map, int32_t, SortedVectorHasher> ecmapinv; + u_map_, int32_t, SortedVectorHasher> ecmapinv; std::vector> ecmap; - std::unordered_map txnames; + u_map_ txnames; parseTranscripts(opt.count_txp, txnames); std::vector genemap(txnames.size(), -1); - std::unordered_map genenames; + u_map_ genenames; parseGenes(opt.count_genes, txnames, genemap, genenames); parseECs(opt.count_ecs, h); ecmap = std::move(h.ecs); diff --git a/src/hash.cpp b/src/hash.cpp new file mode 100644 index 0000000..b2d18e1 --- /dev/null +++ b/src/hash.cpp @@ -0,0 +1,194 @@ +#include +#include +#include "hash.hpp" + +uint64_t inline _rotl64(uint64_t value, int8_t amount) { + return ((value) << (amount)) | ((value) >> (64 - (amount))); +} + +uint32_t SuperFastHash (const char *data, int len) { + uint32_t hash = len, tmp; + int rem; + + if (len <= 0 || data == NULL) { return 0; } + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (; len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + + + + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +inline uint64_t getblock ( const uint64_t *p, int i ) { + return p[i]; +} + +//---------- +// Block mix - combine the key bits with the hash bits and scramble everything + +inline void bmix64 ( uint64_t& h1, uint64_t& h2, uint64_t& k1, uint64_t& k2, uint64_t& c1, uint64_t& c2 ) { + k1 *= c1; + k1 = _rotl64(k1,23); + k1 *= c2; + h1 ^= k1; + h1 += h2; + + h2 = _rotl64(h2,41); + + k2 *= c2; + k2 = _rotl64(k2,23); + k2 *= c1; + h2 ^= k2; + h2 += h1; + + h1 = h1*3+0x52dce729; + h2 = h2*3+0x38495ab5; + + c1 = c1*5+0x7b7d159c; + c2 = c2*5+0x6bce6396; +} + +//---------- +// Finalization mix - avalanches all bits to within 0.05% bias + +inline uint64_t fmix64 ( uint64_t k ) { + k ^= k >> 33; + k *= 0xff51afd7ed558ccd; + k ^= k >> 33; + k *= 0xc4ceb9fe1a85ec53; + k ^= k >> 33; + + return k; +} + +void MurmurHash3_x64_128 ( const void *key, const int len, const uint32_t seed, void *out ) { + const uint8_t *data = (const uint8_t *)key; + const int nblocks = len / 16; + + uint64_t h1 = 0x9368e53c2f6af274 ^ seed; + uint64_t h2 = 0x586dcd208f7cd3fd ^ seed; + + uint64_t c1 = 0x87c37b91114253d5; + uint64_t c2 = 0x4cf5ad432745937f; + + //---------- + // body + + const uint64_t *blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + bmix64(h1,h2,k1,k2,c1,c2); + } + + //---------- + // tail + + const uint8_t *tail = (const uint8_t *)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + bmix64(h1,h2,k1,k2,c1,c2); + }; + + //---------- + // finalization + + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t *)out)[0] = h1; + ((uint64_t *)out)[1] = h2; +} + +//----------------------------------------------------------------------------- +// If we need a smaller hash value, it's faster to just use a portion of the +// 128-bit hash + +void MurmurHash3_x64_32 ( const void *key, int len, uint32_t seed, void *out ) { + uint32_t temp[4]; + + MurmurHash3_x64_128(key,len,seed,temp); + + *(uint32_t *)out = temp[0]; +} + +//---------- + +void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out ) { + uint64_t temp[2]; + + MurmurHash3_x64_128(key,len,seed,temp); + + *(uint64_t *)out = temp[0]; +} + +//----------------------------------------------------------------------------- + diff --git a/src/hash.hpp b/src/hash.hpp new file mode 100644 index 0000000..ab2c0d1 --- /dev/null +++ b/src/hash.hpp @@ -0,0 +1,22 @@ +#ifndef HASH_H +#define HASH_H + +#include /* Replace with if appropriate */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + +uint32_t SuperFastHash (const char *data, int len); + +//void MurmurHash3_x64_32 ( const void * key, int len, uint32_t seed, void * out ); +void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out ); + +#endif +