Skip to content

Commit

Permalink
Merge pull request #253 from smithlabcode/chrom-order-check-for-xcounts
Browse files Browse the repository at this point in the history
Chrom order checks in xcounts
  • Loading branch information
andrewdavidsmith authored Nov 20, 2024
2 parents 85d3dce + cb04f30 commit b058e49
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 44 deletions.
65 changes: 42 additions & 23 deletions src/common/counts_header.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@

#include "counts_header.hpp"

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "bam_record_utils.hpp"

Expand All @@ -34,48 +34,63 @@
#include "bamxx.hpp"
#include "dnmt_error.hpp"

using std::vector;
using std::string;
using std::to_string;
using std::unordered_map;
using std::vector;

using bamxx::bgzf_file;

void
std::unordered_map<std::string, std::uint32_t>
write_counts_header_from_chrom_sizes(const vector<string> &chrom_names,
const vector<uint64_t> &chrom_sizes,
bgzf_file &out) {
const auto version = "#DNMTOOLS " + string(VERSION) + "\n";
out.write(version.c_str());

std::unordered_map<std::string, std::uint32_t> chrom_order;
std::uint32_t chrom_count = 0;

for (auto i = 0u; i < size(chrom_sizes); ++i) {
const string tmp =
"#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n";
out.write(tmp.c_str());
out.write(tmp.data());
chrom_order.emplace(chrom_names[i], chrom_count++);
}
out.write("#\n");
}

return chrom_order;
}

void
std::unordered_map<std::string, std::uint32_t>
write_counts_header_from_file(const string &header_file, bgzf_file &out) {
std::ifstream in(header_file);
if (!in.is_open()) {
throw dnmt_error("failed to open header file: " + header_file);
throw dnmt_error("failed to open header file: " + header_file);
}

std::unordered_map<std::string, std::uint32_t> chrom_order;
std::uint32_t chrom_count = 0;

string line;
while(getline(in, line)) {
while (getline(in, line)) {
out.write(line + '\n');
const auto name = line.substr(1, line.find(' ') - 1);
chrom_order.emplace(name, chrom_count++);
}
in.close();
}

return chrom_order;
}

bamxx::bgzf_file &
skip_counts_header(bamxx::bgzf_file &in) {

// use the kstring_t type to more directly use the BGZF file
kstring_t line{0, 0, nullptr};
const int ret = ks_resize(&line, 1024);
if (ret) return in;
if (ret)
return in;

while (bamxx::getline(in, line) && line.s[0] == '#') {
if (line.s[0] == '#' && line.l == 1)
Expand All @@ -86,7 +101,6 @@ skip_counts_header(bamxx::bgzf_file &in) {
return in;
}


int
get_chrom_sizes_for_counts_header(const uint32_t n_threads,
const string &filename,
Expand All @@ -96,7 +110,8 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads,
bamxx::bam_tpool tpool(n_threads);

bgzf_file in(filename, "r");
if (!in) return -1;
if (!in)
return -1;
if (n_threads > 1 && in.is_bgzf())
tpool.set_io(in);

Expand All @@ -106,18 +121,22 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads,
// use the kstring_t type to more directly use the BGZF file
kstring_t line{0, 0, nullptr};
const int ret = ks_resize(&line, 1024);
if (ret) return -1;
if (ret)
return -1;

uint64_t chrom_size = 0;
while (getline(in, line)) {
if (line.s[0] == '>') {
if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
if (!chrom_names.empty())
chrom_sizes.push_back(chrom_size);
chrom_names.emplace_back(line.s + 1);
chrom_size = 0;
}
else chrom_size += line.l;
else
chrom_size += line.l;
}
if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
if (!chrom_names.empty())
chrom_sizes.push_back(chrom_size);

ks_free(&line);

Expand All @@ -126,7 +145,6 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads,
return 0;
}


void
write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
bgzf_file &out) {
Expand All @@ -142,7 +160,6 @@ write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
out.write("#\n");
}


bool
write_counts_header_line(string line, bgzf_file &out) {
line += '\n';
Expand All @@ -153,8 +170,10 @@ write_counts_header_line(string line, bgzf_file &out) {
bool
get_has_counts_header(const string &filename) {
bgzf_file in(filename, "r");
if (!in) return false;
if (!in)
return false;
string line;
if (!getline(in, line)) return false;
if (!getline(in, line))
return false;
return line[0] == '#';
}
15 changes: 8 additions & 7 deletions src/common/counts_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,19 @@
#ifndef COUNTS_HEADER_HPP
#define COUNTS_HEADER_HPP

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>
#include <cstdint>

#include "bamxx.hpp"

void
write_counts_header_from_chrom_sizes(const std::vector<std::string> &chrom_names,
const std::vector<uint64_t> &chrom_sizes,
bamxx::bgzf_file &out);
std::unordered_map<std::string, std::uint32_t>
write_counts_header_from_chrom_sizes(
const std::vector<std::string> &chrom_names,
const std::vector<uint64_t> &chrom_sizes, bamxx::bgzf_file &out);

void
std::unordered_map<std::string, std::uint32_t>
write_counts_header_from_file(const std::string &header_file,
bamxx::bgzf_file &out);

Expand Down Expand Up @@ -60,7 +61,7 @@ is_counts_header_version_line(const std::string &line) {
return line.compare(0, 9, version_line) == 0;
}

template<typename T>
template <typename T>
inline bool
is_counts_header_line(T &line) {
return line[0] == '#';
Expand Down
96 changes: 82 additions & 14 deletions src/utils/xcounts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <stdexcept>
#include <string>
#include <system_error>
#include <type_traits> // std::underlying_type_t
#include <vector>

// from smithlab_cpp
Expand All @@ -45,6 +46,49 @@ using std::vector;

using bamxx::bgzf_file;

enum class xcounts_err {
// clang-format off
ok = 0,
open_failure = 1,
header_failure = 2,
chromosome_not_found = 3,
inconsistent_chromosome_order = 4,
incorrect_chromosome_size = 5,
failed_to_write_file = 6,
failed_to_parse_site = 7,
// clang-format on
};

struct xcounts_err_cat : std::error_category {
auto name() const noexcept -> const char * override {
return "xcounts error";
}
auto message(const int condition) const -> std::string override {
using std::string_literals::operator""s;
switch (condition) {
case 0: return "ok"s;
case 1: return "failed to open methylome file"s;
case 2: return "failed to parse xcounts header"s;
case 3: return "failed to find chromosome in xcounts header"s;
case 4: return "inconsistent chromosome order"s;
case 5: return "incorrect chromosome size"s;
case 6: return "failed to write to file"s;
case 7: return "failed to parse site"s;
}
std::abort(); // unreacheable
}
};

template <>
struct std::is_error_code_enum<xcounts_err> : public std::true_type {};

std::error_code
make_error_code(xcounts_err e) {
static auto category = xcounts_err_cat{};
return std::error_code(static_cast<std::underlying_type_t<xcounts_err>>(e),
category);
}

template <typename T>
static inline uint32_t
fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) {
Expand Down Expand Up @@ -137,10 +181,12 @@ main_xcounts(int argc, const char **argv) {
tpool.set_io(out);
}

std::unordered_map<string, uint32_t> chrom_order;
if (!header_file.empty())
write_counts_header_from_file(header_file, out);
chrom_order = write_counts_header_from_file(header_file, out);
else if (!genome_file.empty())
write_counts_header_from_chrom_sizes(chrom_names, chrom_sizes, out);
chrom_order =
write_counts_header_from_chrom_sizes(chrom_names, chrom_sizes, out);

// use the kstring_t type to more directly use the BGZF file
kstring_t line{0, 0, nullptr};
Expand All @@ -152,11 +198,14 @@ main_xcounts(int argc, const char **argv) {

uint32_t offset = 0;
string prev_chrom;
bool status_ok = true;
bool found_header = (!genome_file.empty() || !header_file.empty());

std::error_code ec{};

uint32_t chrom_counter = 0;

MSite site;
while (status_ok && bamxx::getline(in, line)) {
while (ec == std::errc{} && bamxx::getline(in, line)) {
if (is_counts_header_line(line.s)) {
if (!genome_file.empty() || !header_file.empty())
continue;
Expand All @@ -166,34 +215,53 @@ main_xcounts(int argc, const char **argv) {
continue;
}

status_ok = site.initialize(line.s, line.s + line.l);
if (!status_ok || !found_header)
if (!site.initialize(line.s, line.s + line.l)) {
ec = xcounts_err::failed_to_parse_site;
break;
}
if (!found_header) {
ec = xcounts_err::header_failure;
break;
}

if (site.chrom != prev_chrom) {
if (verbose)
cerr << "processing: " << site.chrom << endl;

if (!chrom_order.empty()) {
const auto expected_chrom_counter = chrom_order.find(site.chrom);
if (expected_chrom_counter == std::cend(chrom_order)) {
ec = xcounts_err::chromosome_not_found;
break;
}
if (expected_chrom_counter->second != chrom_counter) {
ec = xcounts_err::inconsistent_chromosome_order;
break;
}
}

chrom_counter++;

prev_chrom = site.chrom;
offset = 0;

site.chrom += '\n';
const int64_t sz = size(site.chrom);
status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz;
if (bgzf_write(out.f, site.chrom.data(), sz) != sz)
ec = xcounts_err::failed_to_write_file;
}
if (site.n_reads > 0) {
const int64_t sz = fill_output_buffer(offset, site, buf);
status_ok = bgzf_write(out.f, buf.data(), sz) == sz;
if (bgzf_write(out.f, buf.data(), sz) != sz)
ec = xcounts_err::failed_to_write_file;
offset = site.pos;
}
}
ks_free(&line);

if (!status_ok) {
cerr << "failed converting " << filename << " to " << outfile << endl;
return EXIT_FAILURE;
}
if (!found_header) {
cerr << "no header provided or found" << endl;
if (ec) {
cerr << "failed converting " << filename << " to " << outfile << endl
<< ec << endl;
return EXIT_FAILURE;
}
}
Expand Down

0 comments on commit b058e49

Please sign in to comment.