Skip to content

Commit

Permalink
undo bitmap and unordered map
Browse files Browse the repository at this point in the history
  • Loading branch information
Yenaled committed Mar 1, 2023
1 parent 3d3df88 commit e3b6ff4
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 2,586 deletions.
84 changes: 49 additions & 35 deletions src/Common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ std::vector<int32_t> intersect_vectors(const std::vector<std::vector<int32_t>> &
return std::move(u);
}

int32_t intersect_ecs(const std::vector<int32_t> &ecs, Roaring &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes) {
int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes) {
if (ecs.empty()) {
return -1;
}
Expand All @@ -85,36 +85,59 @@ int32_t intersect_ecs(const std::vector<int32_t> &ecs, Roaring &u, const std::ve
if (ecs.size() == 1) {
return ecs[0]; // no work
}

uint32_t *data = reinterpret_cast<uint32_t*>(const_cast<int32_t*>(&(ecmap[ecs[0]][0])));
u = Roaring(ecmap[ecs[0]].size(), data);

u.resize(0);
auto &v = ecmap[ecs[0]]; // copy
for (size_t i = 0; i< v.size(); i++) {
u.push_back(v[i]);
}

for (size_t i = 1; i < ecs.size(); i++) {
if (ecs[i] < 0 || ecs[i] >= ecmap.size()) {
return -1;
}
data = reinterpret_cast<uint32_t*>(const_cast<int32_t*>(&(ecmap[ecs[i]][0])));
u &= Roaring(ecmap[ecs[i]].size(), data);
const auto &v = ecmap[ecs[i]];

int j = 0;
int k = 0;
int l = 0;
int n = u.size();
int m = v.size();
// u and v are sorted, j,k,l = 0
while (j < n && l < m) {
// invariant: u[:k] is the intersection of u[:j] and v[:l], j <= n, l <= m
// u[:j] <= u[j:], v[:l] <= v[l:], u[j:] is sorted, v[l:] is sorted, u[:k] is sorted
if (u[j] < v[l]) {
j++;
} else if (u[j] > v[l]) {
l++;
} else {
// match
if (k < j) {
std::swap(u[k], u[j]);
}
k++;
j++;
i++;
}
}
if (k < n) {
u.resize(k);
}
}

if (u.isEmpty()) {
if (u.empty()) {
return -1;
}
auto iit = ecmapinv.find(u);
if (iit == ecmapinv.end()) {
// create new equivalence class
int32_t ec = ecmap.size();
uint32_t* u_arr = new uint32_t[u.cardinality()];
u.toUint32Array(u_arr);
std::vector<int32_t> u_vec;
u_vec.reserve(u.cardinality());
for (size_t i = 0; i < u.cardinality(); i++) u_vec.push_back(static_cast<int32_t>(u_arr[i]));
delete[] u_arr;
ecmap.push_back(u_vec);
ecmap.push_back(u);
ecmapinv.insert({u,ec});
// figure out the gene list
std::vector<int32_t> v;
vt2gene(u_vec, genemap, v);
vt2gene(u, genemap, v);
ec2genes.push_back(std::move(v));
return ec;
} else {
Expand Down Expand Up @@ -192,7 +215,7 @@ void intersect_genes_of_ecs(const std::vector<int32_t> &ecs, const std::vector<
int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty) {

std::vector<std::vector<int32_t>> gu; // per gene transcript results
Roaring u; // final list of transcripts
std::vector<int32_t> u; // final list of transcripts
std::vector<int32_t> glist;

int32_t lastg = -2;
Expand Down Expand Up @@ -222,9 +245,11 @@ int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vec
// frequent case, single gene replace with union
for (auto ec : ecs) {
for (const auto &t : ecmap[ec]) {
u.add(t);
u.push_back(t);
}
}
std::sort(u.begin(), u.end());
u.erase(std::unique(u.begin(), u.end()), u.end());

// look up ecs based on u
int32_t ec = -1;
Expand All @@ -235,15 +260,9 @@ int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vec
} else {
ec = ecmapinv.size();
ecmapinv.insert({u,ec});
uint32_t* u_arr = new uint32_t[u.cardinality()];
u.toUint32Array(u_arr);
std::vector<int32_t> u_vec;
u_vec.reserve(u.cardinality());
for (size_t i = 0; i < u.cardinality(); i++) u_vec.push_back(static_cast<int32_t>(u_arr[i]));
delete[] u_arr;
ecmap.push_back(u_vec);
ecmap.push_back(u);
std::vector<int32_t> v;
vt2gene(u_vec, genemap, v);
vt2gene(u, genemap, v);
ec2genes.push_back(std::move(v));
}

Expand Down Expand Up @@ -272,13 +291,14 @@ int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vec
}

for (auto t : uu) {
u.add(t);
u.push_back(t);
}
}

if (u.isEmpty()) {
if (u.empty()) {
return -1;
}
std::sort(u.begin(), u.end());

int32_t ec = -1;
auto it = ecmapinv.find(u);
Expand All @@ -287,15 +307,9 @@ int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vec
} else {
ec = ecmapinv.size();
ecmapinv.insert({u,ec});
uint32_t* u_arr = new uint32_t[u.cardinality()];
u.toUint32Array(u_arr);
std::vector<int32_t> u_vec;
u_vec.reserve(u.cardinality());
for (size_t i = 0; i < u.cardinality(); i++) u_vec.push_back(static_cast<int32_t>(u_arr[i]));
delete[] u_arr;
ecmap.push_back(u_vec);
ecmap.push_back(u);
std::vector<int32_t> v;
vt2gene(u_vec, genemap, v);
vt2gene(u, genemap, v);
ec2genes.push_back(std::move(v));
}
return ec;
Expand Down
7 changes: 3 additions & 4 deletions src/Common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@
#include <string>
#include <unordered_map>
#include <sstream>
#include "robin_hood.h"
#include "roaring.hh"
#include "hash.hpp"

#define BUSTOOLS_VERSION "0.42.0"

#define u_map_ robin_hood::unordered_flat_map
#define u_map_ std::unordered_map
enum CAPTURE_TYPE : char
{
CAPTURE_NONE = 0,
Expand Down Expand Up @@ -186,12 +185,12 @@ struct RoaringHasher {
return r;
}
};
typedef u_map_<Roaring, int32_t, RoaringHasher> EcMapInv;
typedef u_map_<std::vector<int32_t>, int32_t, SortedVectorHasher> EcMapInv;

std::vector<int32_t> intersect(std::vector<int32_t> &u, std::vector<int32_t> &v);
std::vector<int32_t> union_vectors(const std::vector<std::vector<int32_t>> &v);
std::vector<int32_t> intersect_vectors(const std::vector<std::vector<int32_t>> &v);
int32_t intersect_ecs(const std::vector<int32_t> &ecs, Roaring &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes);
int32_t intersect_ecs(const std::vector<int32_t> &ecs, std::vector<int32_t> &u, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes);
void vt2gene(const std::vector<int32_t> &v, const std::vector<int32_t> &genemap, std::vector<int32_t> &glist);
void intersect_genes_of_ecs(const std::vector<int32_t> &ecs, const std::vector<std::vector<int32_t>> &ec2genes, std::vector<int32_t> &glist);
int32_t intersect_ecs_with_genes(const std::vector<int32_t> &ecs, const std::vector<int32_t> &genemap, std::vector<std::vector<int32_t>> &ecmap, EcMapInv &ecmapinv, std::vector<std::vector<int32_t>> &ec2genes, bool assumeIntersectionIsEmpty = true);
Expand Down
6 changes: 3 additions & 3 deletions src/bustools_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ void bustools_count(Bustools_opt &opt) {
ecmap = std::move(h.ecs);
ecmapinv.reserve(ecmap.size());
for (int32_t ec = 0; ec < ecmap.size(); ec++) {
uint32_t *data = reinterpret_cast<uint32_t*>(const_cast<int32_t*>(&(ecmap[ec][0])));
ecmapinv.insert({Roaring(ecmap[ec].size(), data), ec});
ecmapinv.insert({ecmap[ec], ec});
}
std::vector<std::vector<int32_t>> ec2genes;
create_ec2genes(ecmap, genemap, ec2genes);
Expand Down Expand Up @@ -88,7 +87,8 @@ void bustools_count(Bustools_opt &opt) {
std::vector<int32_t> ecs;
std::vector<int32_t> glist;
ecs.reserve(100);
Roaring u;
std::vector<int32_t> u;
u.reserve(100);
std::vector<int32_t> column_v;
std::vector<std::pair<int32_t, std::pair<double, COUNT_MTX_TYPE>>> column_vp; // gene, {count, matrix type}
if (!opt.count_collapse) {
Expand Down
Loading

0 comments on commit e3b6ff4

Please sign in to comment.