Skip to content

Commit

Permalink
Merge pull request #85 from BUStools/devel_compress
Browse files Browse the repository at this point in the history
Merge compression / decompression
  • Loading branch information
pmelsted authored Nov 28, 2022
2 parents ff58b1c + 95c3cc6 commit 671f60b
Show file tree
Hide file tree
Showing 8 changed files with 2,267 additions and 0 deletions.
124 changes: 124 additions & 0 deletions src/BUSData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,36 @@ uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag) {
return r;
}

/**
 * Identifies the kind of file behind `inf` from its first four bytes and, for
 * BUS files, parses the header.
 *
 * The four magic bytes are read and then pushed back, so the stream is handed
 * to the header parsers (and returned to the caller for the other file types)
 * positioned at the start of the magic. Pushing back several characters relies
 * on the underlying stream buffer supporting it.
 *
 * @param inf         Stream positioned at the beginning of the file.
 * @param header      Filled by parseHeader() when the magic is "BUS\0".
 * @param comp_header Filled by parseCompressedHeader() when the magic is "BUS\1".
 * @return A BUSFILE_TYPE value. For the two BUS types the result is 0 when the
 *         corresponding header parser reports failure. Input that matches no
 *         known magic (including input shorter than four bytes) is reported as
 *         BUSFILE_TYPE::EC_MATRIX.
 */
int identifyParseHeader(std::istream &inf, BUSHeader &header, compressed_BUSHeader &comp_header)
{
  constexpr std::streamsize kMagicLen = 4;
  // Zero-initialized so a short read never leaves indeterminate bytes behind.
  char magic[kMagicLen] = {};

  inf.read(magic, kMagicLen);
  const std::streamsize got = inf.gcount();

  if (got > 0 && got < kMagicLen)
  {
    // A short read sets eofbit|failbit, which would turn putback() into a
    // no-op and lose the bytes already consumed. The stream was readable on
    // entry (we extracted data), so reset the state before restoring them.
    inf.clear();
  }
  // Restore exactly the bytes that were consumed, last one first.
  for (std::streamsize i = got; i > 0; --i)
  {
    inf.putback(magic[i - 1]);
  }

  if (got < kMagicLen)
  {
    // Too short to carry any magic: fall back to the default type.
    return BUSFILE_TYPE::EC_MATRIX;
  }

  if (std::memcmp(magic, "BUS\0", kMagicLen) == 0)
  {
    return parseHeader(inf, header) ? BUSFILE_TYPE::BUSFILE : 0;
  }
  if (std::memcmp(magic, "BUS\1", kMagicLen) == 0)
  {
    return parseCompressedHeader(inf, comp_header) ? BUSFILE_TYPE::BUSFILE_COMPRESED : 0;
  }
  if (std::memcmp(magic, "BZI\0", kMagicLen) == 0)
  {
    return BUSFILE_TYPE::BUSZ_INDEX;
  }
  if (std::memcmp(magic, "BEC\0", kMagicLen) == 0)
  {
    return BUSFILE_TYPE::EC_MATRIX_COMPRESSED;
  }
  return BUSFILE_TYPE::EC_MATRIX;
}

bool parseHeader(std::istream &inf, BUSHeader &header) {
char magic[4];
Expand All @@ -95,7 +125,80 @@ bool parseHeader(std::istream &inf, BUSHeader &header) {
return true;
}

/**
 * Reads a compressed BUS header from `inf` into `compheader`.
 *
 * Layout consumed, in order: the 4-byte magic "BUS\1", then version, bclen,
 * umilen, a 32-bit text length followed by that many text bytes (all stored in
 * compheader.bus_header), and finally chunk_size, pfd_blocksize and lossy_umi.
 *
 * @param inf        Stream positioned at the magic bytes.
 * @param compheader Destination; may be partially filled when false is returned.
 * @return true when the whole header was read and the version equals
 *         BUSFORMAT_VERSION; false on a bad magic, a version mismatch, or a
 *         truncated / unreadable stream.
 */
bool parseCompressedHeader(std::istream &inf, compressed_BUSHeader &compheader)
{
  char magic[4] = {};
  inf.read(magic, sizeof(magic));
  if (!inf || std::memcmp(magic, "BUS\1", sizeof(magic)) != 0)
  {
    std::cerr << "Invalid header magic\n";
    return false;
  }

  BUSHeader &header = compheader.bus_header;
  inf.read(reinterpret_cast<char *>(&header.version), sizeof(header.version));
  if (!inf || header.version != BUSFORMAT_VERSION)
  {
    return false;
  }

  inf.read(reinterpret_cast<char *>(&header.bclen), sizeof(header.bclen));
  inf.read(reinterpret_cast<char *>(&header.umilen), sizeof(header.umilen));
  uint32_t tlen = 0;
  inf.read(reinterpret_cast<char *>(&tlen), sizeof(tlen));
  if (!inf)
  {
    // Without a trustworthy length we must not size a buffer from it.
    return false;
  }

  // std::string owns the buffer (no manual delete) and sizing it with tlen
  // directly avoids the tlen + 1 wrap-around when tlen == UINT32_MAX.
  std::string text(tlen, '\0');
  if (tlen > 0)
  {
    inf.read(&text[0], tlen);
  }
  if (!inf)
  {
    return false;
  }
  // The text is treated as a C string: anything after an embedded NUL is dropped.
  header.text.assign(text.c_str());

  // The compressed-header-specific fields are stored after the regular header.
  inf.read(reinterpret_cast<char *>(&compheader.chunk_size), sizeof(compheader.chunk_size));
  inf.read(reinterpret_cast<char *>(&compheader.pfd_blocksize), sizeof(compheader.pfd_blocksize));
  inf.read(reinterpret_cast<char *>(&compheader.lossy_umi), sizeof(compheader.lossy_umi));

  // A truncated tail leaves the stream failed; report that instead of success.
  return static_cast<bool>(inf);
}

/**
 * Reads a text equivalence-class listing from `in` and appends one entry per
 * non-empty line to header.ecs.
 *
 * Each line starts with an integer id, followed by a comma-separated list of
 * integers. The id is asserted to equal the zero-based count of non-empty
 * lines read so far. The first line whose list is not exactly the single
 * value equal to its own id is reported once on stderr.
 *
 * @param in     Stream to read lines from until getline fails.
 * @param header Receives the parsed lists in header.ecs.
 * @return Always true.
 */
bool parseECs_stream(std::istream &in, BUSHeader &header)
{
  auto &ecs = header.ecs;

  std::string line;
  line.reserve(10000);
  std::string token;

  int expected_ec = 0;
  bool reported = false;

  while (std::getline(in, line))
  {
    if (line.empty())
    {
      continue;
    }

    std::stringstream fields(line);
    int ec = -1;
    fields >> ec;
    assert(ec == expected_ec);

    // Whatever follows the id is split on commas; std::stoi skips the
    // whitespace that separates the id from the first value.
    std::vector<int32_t> members;
    while (std::getline(fields, token, ','))
    {
      members.push_back(std::stoi(token));
    }

    // One-time diagnostic for the first entry that is not the list {id}.
    if (!reported && (members.size() != 1 || members[0] != expected_ec))
    {
      reported = true;
      std::cerr << "first line is " << expected_ec << '\n';
    }

    ecs.push_back(std::move(members));
    ++expected_ec;
  }
  return true;
}

bool parseECs(const std::string &filename, BUSHeader &header) {
auto &ecs = header.ecs;
Expand Down Expand Up @@ -294,3 +397,24 @@ bool writeHeader(std::ostream &outf, const BUSHeader &header) {

return true;
}

bool writeCompressedHeader(std::ostream &outf, const compressed_BUSHeader &compheader)
{
outf.write("BUS\1", 4);

// We start writing out the contents of the general header
const auto header = compheader.bus_header;
outf.write((char *)(&header.version), sizeof(header.version));
outf.write((char *)(&header.bclen), sizeof(header.bclen));
outf.write((char *)(&header.umilen), sizeof(header.umilen));
uint32_t tlen = header.text.size();
outf.write((char *)(&tlen), sizeof(tlen));
outf.write((char *)header.text.c_str(), tlen);

// We end by writing out the compressed-header-specific data
outf.write((char *)(&compheader.chunk_size), sizeof(compheader.chunk_size));
outf.write((char *)&compheader.pfd_blocksize, sizeof(compheader.pfd_blocksize));
outf.write((char *)(&compheader.lossy_umi), sizeof(compheader.lossy_umi));

return true;
}
21 changes: 21 additions & 0 deletions src/BUSData.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ struct BUSHeader {
BUSHeader() : version(0), bclen(0), umilen(0) {}
};

// Header of a compressed BUS file: a regular BUSHeader plus three
// compression-specific 32-bit fields that are serialized after it.
// NOTE(review): the exact meaning of chunk_size, lossy_umi and pfd_blocksize
// is defined by the compression code, which is not visible here.
struct compressed_BUSHeader
{
  uint32_t chunk_size;
  uint32_t lossy_umi;
  uint32_t pfd_blocksize;
  BUSHeader bus_header;

  // Defaults: chunk_size = 0, lossy_umi = 0, pfd_blocksize = 512.
  compressed_BUSHeader()
      : chunk_size(0),
        lossy_umi(0),
        pfd_blocksize(512)
  {
  }
};

struct BUSData {
uint64_t barcode;
uint64_t UMI;
Expand All @@ -38,11 +47,23 @@ struct BUSData {
BUSData() : barcode(0), UMI(0), ec(-1), count(0), flags(0), pad(0) {}
};

// File kinds reported by identifyParseHeader(). Values are consecutive and
// start at 1, which leaves 0 free for that function's "header parsing failed"
// result.
enum BUSFILE_TYPE
{
  BUSFILE = 1,          // magic "BUS\0"
  BUSFILE_COMPRESED,    // 2, magic "BUS\1"
  BUSZ_INDEX,           // 3, magic "BZI\0"
  EC_MATRIX,            // 4, no recognized magic
  EC_MATRIX_COMPRESSED  // 5, magic "BEC\0"
};

bool parseHeader(std::istream &inf, BUSHeader &header);
bool writeHeader(std::ostream &outf, const BUSHeader &header);

// Reads a compressed BUS header (magic "BUS\1") from `inf` into `header`.
// Returns false on a bad magic or a version mismatch.
bool parseCompressedHeader(std::istream &inf, compressed_BUSHeader &header);
// Writes `header` to `outf` in the layout parseCompressedHeader() reads.
bool writeCompressedHeader(std::ostream &outf, const compressed_BUSHeader &header);
// Inspects the first four bytes of `inf`, pushes them back, and returns a
// BUSFILE_TYPE value. For BUS and compressed BUS files the matching header is
// parsed into `header` / `comp_header`, and 0 is returned if that parse fails.
int identifyParseHeader(std::istream &inf, BUSHeader &header, compressed_BUSHeader &comp_header);

// Reads a text equivalence-class listing from `in`, appending one entry per
// non-empty line to header.ecs.
bool parseECs_stream(std::istream &in, BUSHeader &header);
bool parseECs(const std::string &filename, BUSHeader &header);
bool writeECs(const std::string &filename, const BUSHeader &header);
bool writeGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &genenames);
Expand Down
6 changes: 6 additions & 0 deletions src/Common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ struct Bustools_opt
/* linker */
int start, end;

/* Compression */
std::string busz_index;
uint32_t chunk_size = 100000;
uint32_t lossy_umi = 0;
uint32_t pfd_blocksize = 512;

Bustools_opt() : threads(1), max_memory(1ULL << 32), type(0),
threshold(0), start(-1), end(-1) {}
};
Expand Down
Loading

0 comments on commit 671f60b

Please sign in to comment.