Skip to content

Commit

Permalink
Merge pull request #92 from BUStools/devel
Browse files Browse the repository at this point in the history
Devel
  • Loading branch information
Yenaled authored Jul 1, 2023
2 parents 2c312df + 7a11c5a commit 2be1e43
Show file tree
Hide file tree
Showing 18 changed files with 1,436 additions and 701 deletions.
10 changes: 5 additions & 5 deletions src/BUSData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ bool writeECs(const std::string &filename, const BUSHeader &header) {
return true;
}

bool writeGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &genenames) {
bool writeGenes(const std::string &filename, const u_map_<std::string, int32_t> &genenames) {
std::ofstream outf;
outf.open(filename.c_str(), std::ios::out);

Expand All @@ -279,7 +279,7 @@ bool writeGenes(const std::string &filename, const std::unordered_map<std::strin
return true;
}

bool parseTranscripts(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames) {
bool parseTranscripts(const std::string &filename, u_map_<std::string, int32_t> &txnames) {
std::ifstream inf(filename.c_str());

int i = 0;
Expand All @@ -291,7 +291,7 @@ bool parseTranscripts(const std::string &filename, std::unordered_map<std::strin
return true;
}

bool parseTxCaptureList(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures) {
bool parseTxCaptureList(const std::string &filename, u_map_<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures) {
std::ifstream inf(filename.c_str());

std::string txp;
Expand All @@ -318,7 +318,7 @@ bool parseBcUmiCaptureList(const std::string &filename, std::unordered_set<uint6
return true;
}

bool parse_ProjectMap(const std::string &filename, std::unordered_map<uint64_t, uint64_t> &project_map) {
bool parse_ProjectMap(const std::string &filename, u_map_<uint64_t, uint64_t> &project_map) {
// This function occurs in 3 places: here, BUSData.h, and bustools_project.cpp
std::ifstream inf(filename.c_str());

Expand Down Expand Up @@ -346,7 +346,7 @@ bool parseFlagsCaptureList(const std::string &filename, std::unordered_set<uint6
return true;
}

bool parseGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, std::unordered_map<std::string, int32_t> &genenames) {
bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, u_map_<std::string, int32_t> &genenames) {
std::ifstream inf(filename.c_str());

std::string line, t;
Expand Down
11 changes: 6 additions & 5 deletions src/BUSData.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <unordered_set>
#include <stdint.h>
#include <fstream>
#include "Common.hpp"

const uint32_t BUSFORMAT_VERSION = 1;

Expand Down Expand Up @@ -66,15 +67,15 @@ int identifyParseHeader(std::istream &inf, BUSHeader &header, compressed_BUSHead
bool parseECs_stream(std::istream &in, BUSHeader &header);
bool parseECs(const std::string &filename, BUSHeader &header);
bool writeECs(const std::string &filename, const BUSHeader &header);
bool writeGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &genenames);
bool parseGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, std::unordered_map<std::string, int32_t> &genenames);
bool writeGenes(const std::string &filename, const u_map_<std::string, int32_t> &genenames);
bool parseGenes(const std::string &filename, const u_map_<std::string, int32_t> &txnames, std::vector<int32_t> &genemap, u_map_<std::string, int32_t> &genenames);
bool parseGenesList(const std::string& filename, std::vector<std::string>& geneNames);
bool parseTxCaptureList(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures);
bool parseTxCaptureList(const std::string &filename, u_map_<std::string, int32_t> &txnames, std::unordered_set<uint64_t> &captures);
bool parseBcUmiCaptureList(const std::string &filename, std::unordered_set<uint64_t> &captures);
bool parseFlagsCaptureList(const std::string &filename, std::unordered_set<uint64_t> &captures);
bool parseTranscripts(const std::string &filename, std::unordered_map<std::string, int32_t> &txnames);
bool parseTranscripts(const std::string &filename, u_map_<std::string, int32_t> &txnames);

bool parse_ProjectMap(const std::string &filename, std::unordered_map<uint64_t, uint64_t> &project_map);
bool parse_ProjectMap(const std::string &filename, u_map_<uint64_t, uint64_t> &project_map);

uint64_t stringToBinary(const std::string &s, uint32_t &flag);
uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag);
Expand Down
83 changes: 78 additions & 5 deletions src/Common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ std::vector<int32_t> intersect_vectors(const std::vector<std::vector<int32_t>> &
return std::move(u);
}

int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes) {
int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes) {
if (ecs.empty()) {
return -1;
}
Expand All @@ -91,13 +91,13 @@ int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u,
for (size_t i = 0; i< v.size(); i++) {
u.push_back(v[i]);
}

for (size_t i = 1; i < ecs.size(); i++) {
if (ecs[i] < 0 || ecs[i] >= ecmap.size()) {
return -1;
}
const auto &v = ecmap[ecs[i]];

int j = 0;
int k = 0;
int l = 0;
Expand Down Expand Up @@ -212,7 +212,7 @@ void intersect_genes_of_ecs(const std::vector<int32_t> &ecs, const std::vector<
}


int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty) {
int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty) {

std::vector<std::vector<int32_t>> gu; // per gene transcript results
std::vector<int32_t> u; // final list of transcripts
Expand Down Expand Up @@ -327,9 +327,82 @@ void create_ec2genes(const std::vector<std::vector<int32_t>> &ecmap, const std::
}
}

COUNT_MTX_TYPE intersect_ecs_with_subset_txs(int32_t ec, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split) {
if (tx_split.size() == 0) return COUNT_DEFAULT;
std::vector<int32_t> ecs;
ecs.push_back(ec);
return intersect_ecs_with_subset_txs(ecs, ecmap, tx_split);
}

COUNT_MTX_TYPE intersect_ecs_with_subset_txs(const std::vector<int32_t>& ecs, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split) {
// Note: tx_split indices are tx ids and values are 1 (exists in split) or 0 (does not exist in split)
if (tx_split.size() == 0) return COUNT_DEFAULT;
if (ecs.size() == 0) return COUNT_AMBIGUOUS; // Shouldn't happen
std::vector<int32_t> u;
u.resize(0);
auto &v = ecmap[ecs[0]]; // copy
for (size_t i = 0; i< v.size(); i++) {
u.push_back(v[i]);
}
for (size_t i = 1; i < ecs.size(); i++) {
const auto &v = ecmap[ecs[i]];

int j = 0;
int k = 0;
int l = 0;
int n = u.size();
int m = v.size();
// u and v are sorted, j,k,l = 0
while (j < n && l < m) {
// invariant: u[:k] is the intersection of u[:j] and v[:l], j <= n, l <= m
// u[:j] <= u[j:], v[:l] <= v[l:], u[j:] is sorted, v[l:] is sorted, u[:k] is sorted
if (u[j] < v[l]) {
j++;
} else if (u[j] > v[l]) {
l++;
} else {
// match
if (k < j) {
std::swap(u[k], u[j]);
}
k++;
j++;
l++;
}
}
if (k < n) {
u.resize(k);
}
}
size_t n_1 = 0;
size_t n_2 = 0;
for (auto t : u) {
if(tx_split[t]) {
n_2++;
} else {
n_1++;
}
if (n_1 > 0 && n_2 > 0) break; // Stop searching
}
return (n_1 > 0 && n_2 > 0 ? COUNT_AMBIGUOUS : (n_1 > 0 ? COUNT_DEFAULT : COUNT_SPLIT));
/*for (auto ec : ecs) { // We still need to optimize this
for (auto t: ecmap[ec]) {
if(std::find(tx_split.begin(), tx_split.end(), t) != tx_split.end()) {
n_2++;
} else {
n_1++;
}
if (n_1 > 0 && n_2 > 0) break; // Stop searching
}
if (n_1 > 0 && n_2 > 0) break; // Stop searching
}*/
//return (n_1 > 0 && n_2 > 0 ? COUNT_AMBIGUOUS : (n_1 > 0 ? COUNT_DEFAULT : COUNT_SPLIT));
}


void copy_file(std::string src, std::string dest) {
std::ifstream isrc(src, std::ios::binary);
std::ofstream idest(dest, std::ios::binary);

idest << isrc.rdbuf();
}
}
42 changes: 37 additions & 5 deletions src/Common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@
#include <string>
#include <unordered_map>
#include <sstream>
#include "roaring.hh"
#include "hash.hpp"

#define BUSTOOLS_VERSION "0.42.0"
#define BUSTOOLS_VERSION "0.43.0"

#define u_map_ std::unordered_map
enum CAPTURE_TYPE : char
{
CAPTURE_NONE = 0,
Expand All @@ -35,6 +38,12 @@ enum PROJECT_TYPE : char
PROJECT_TX,
PROJECT_F
};
enum COUNT_MTX_TYPE : char
{
COUNT_DEFAULT = 0,
COUNT_SPLIT,
COUNT_AMBIGUOUS
};

struct Bustools_opt
{
Expand Down Expand Up @@ -62,6 +71,7 @@ struct Bustools_opt
std::string count_genes;
std::string count_ecs;
std::string count_txp;
std::string count_split;
bool count_em = false;
bool count_cm = false;
bool count_collapse = false;
Expand All @@ -75,6 +85,7 @@ struct Bustools_opt
std::string dump;
bool dump_bool = false;
bool split_correct = false;
bool barcode_replacement = false;

/* predict */
std::string predict_input; //specified the same way as the output for count - count and histogram filenames will be created from this
Expand All @@ -98,6 +109,7 @@ struct Bustools_opt
/* text */
bool text_dumpflags = false;
bool text_dumppad = false;
bool text_showall = false;

/* linker */
int start, end;
Expand Down Expand Up @@ -151,22 +163,42 @@ struct SortedVectorHasher
int i = 0;
for (auto x : v)
{
uint64_t t = std::hash<int32_t>{}(x);
uint64_t t;
MurmurHash3_x64_64(&x,sizeof(x), 0,&t);
t = (x >> i) | (x << (64 - i));
r = r ^ t;
i = (i + 1) % 64;
i = (i+1)&63;
}
return r;
}
};

struct RoaringHasher {
size_t operator()(const Roaring& rr) const {
uint64_t r = 0;
int i=0;
for (auto x : rr) {
uint64_t t;
MurmurHash3_x64_64(&x, sizeof(x), 0, &t);
t = (x>>i) | (x<<(64-i));
r ^= t;
i = (i+1)&63; // (i+1)%64
}
return r;
}
};
typedef u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> EcMapInv;

std::vector<int32_t> intersect(std::vector<int32_t> &u, std::vector<int32_t> &v);
std::vector<int32_t> union_vectors(const std::vector<std::vector<int32_t>> &v);
std::vector<int32_t> intersect_vectors(const std::vector<std::vector<int32_t>> &v);
int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes);
int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes);
void vt2gene(const std::vector<int32_t> &v, const std::vector<int32_t> &genemap, std::vector<int32_t> &glist);
void intersect_genes_of_ecs(const std::vector<int32_t> &ecs, const std::vector<std::vector<int32_t>> &ec2genes, std::vector<int32_t> &glist);
int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty = true);
int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty = true);
void create_ec2genes(const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ec2gene);
COUNT_MTX_TYPE intersect_ecs_with_subset_txs(int32_t ec, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split);
COUNT_MTX_TYPE intersect_ecs_with_subset_txs(const std::vector<int32_t>& ecs, const std::vector<std::vector<int32_t>> &ecmap, const std::vector<int32_t>& tx_split);

void copy_file(std::string src, std::string dest);

Expand Down
4 changes: 2 additions & 2 deletions src/bustools_capture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ void bustools_capture(Bustools_opt &opt) {

std::unordered_set<uint64_t> captures;
std::vector<std::vector<int32_t>> ecmap;
std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;

if (opt.type == CAPTURE_TX) {
// parse ecmap and capture list
std::unordered_map<std::string, int32_t> txnames;
u_map_<std::string, int32_t> txnames;
std::cerr << "Parsing transcripts .. "; std::cerr.flush();
parseTranscripts(opt.count_txp, txnames);
std::cerr << "done" << std::endl;
Expand Down
8 changes: 4 additions & 4 deletions src/bustools_clusterhist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ void bustools_clusterhist(Bustools_opt& opt) {

// read and parse the equivelence class files

std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
std::vector<std::vector<int32_t>> ecmap;

std::unordered_map<std::string, int32_t> txnames;
u_map_<std::string, int32_t> txnames;
parseTranscripts(opt.count_txp, txnames);
std::vector<int32_t> genemap(txnames.size(), -1);
std::unordered_map<std::string, int32_t> genenames;
u_map_<std::string, int32_t> genenames;
parseGenes(opt.count_genes, txnames, genemap, genenames);
parseECs(opt.count_ecs, h);
ecmap = std::move(h.ecs);
Expand Down Expand Up @@ -52,7 +52,7 @@ void bustools_clusterhist(Bustools_opt& opt) {

//Read the cluster file
std::vector<std::string> clusterNames;
std::unordered_map<uint64_t, size_t> bcClusters;
u_map_<uint64_t, size_t> bcClusters;
{
std::ifstream ifs(opt.cluster_input_file);
uint32_t flag = 0;
Expand Down
6 changes: 3 additions & 3 deletions src/bustools_collapse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ void bustools_collapse(Bustools_opt &opt) {

// read and parse the equivelence class files

std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
std::vector<std::vector<int32_t>> ecmap;

std::unordered_map<std::string, int32_t> txnames;
u_map_<std::string, int32_t> txnames;
parseTranscripts(opt.count_txp, txnames);
std::vector<int32_t> genemap(txnames.size(), -1);
std::unordered_map<std::string, int32_t> genenames;
u_map_<std::string, int32_t> genenames;
parseGenes(opt.count_genes, txnames, genemap, genenames);
parseECs(opt.count_ecs, h);
ecmap = std::move(h.ecs);
Expand Down
Loading

0 comments on commit 2be1e43

Please sign in to comment.