Skip to content

Commit

Permalink
[FIX] Parsing BAM headers with 64 reference sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Mar 4, 2021
1 parent 803ee1d commit 7d0230f
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ If possible, provide tooling that performs the changes, e.g. a shell-script.

* Requesting the alignment without also requesting the sequence for BAM files containing empty CIGAR strings does now
not result in erroneous parsing ([\#2418](https://github.com/seqan/seqan3/pull/2418)).
* BAM files with 64 references are now parsed correctly ([\#2423](https://github.com/seqan/seqan3/pull/2423)).

## API changes

Expand Down
42 changes: 33 additions & 9 deletions include/seqan3/io/sam_file/format_sam_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -493,16 +493,40 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
ref_seqs_type & /*ref_id_to_pos_map*/)
{
auto it = std::ranges::begin(stream_view);
auto end = std::ranges::end(stream_view);
std::vector<char> string_buffer{};

auto parse_tag_value = [&stream_view, &it, this] (auto & value) // helper function to parse the next tag value
auto take_until_predicate = [&it, &string_buffer] (auto const & predicate)
{
detail::consume(stream_view | views::take_until_or_throw(is_char<':'>)); // skip tag name
std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
read_field(stream_view | views::take_until_or_throw(is_char<'\t'> || is_char<'\n'>), value);
it = std::ranges::begin(stream_view); // make sure iterator is up2date
string_buffer.clear();
while (!predicate(*it))
{
string_buffer.push_back(*it);
++it;
}
};

auto skip_tag_name_and_colon = [&it] ()
{
while (!is_char<':'>(*it))
++it;
++it;
};

auto parse_tag_value = [&] (auto & value) // helper function to parse the next tag value
{
skip_tag_name_and_colon();
take_until_predicate(is_char<'\t'> || is_char<'\n'>);
read_field(string_buffer, value);
};

auto parse_until_newline = [&] (auto & value)
{
take_until_predicate(is_char<'\n'>);
read_field(string_buffer, value);
};

while (is_char<'@'>(*it))
while (it != end && is_char<'@'>(*it))
{
++it; // skip @

Expand Down Expand Up @@ -552,7 +576,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
if (is_char<'\t'>(*it)) // read rest of the tags
{
++it; // skip tab
read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), get<1>(info));
parse_until_newline(get<1>(info));
}
++it; // skip newline

Expand Down Expand Up @@ -594,7 +618,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
if (is_char<'\t'>(*it)) // read rest of the tags
{
++it;
read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), get<1>(tmp));
parse_until_newline(get<1>(tmp));
}
++it; // skip newline

Expand Down Expand Up @@ -653,7 +677,7 @@ inline void format_sam_base::read_header(stream_view_type && stream_view,
++it; // skip C
++it; // skip O
++it; // skip :
read_field(stream_view | views::take_until_or_throw(is_char<'\n'>), tmp);
parse_until_newline(tmp);
++it; // skip newline

hdr.comments.emplace_back(std::move(tmp));
Expand Down
41 changes: 41 additions & 0 deletions test/unit/io/sam_file/format_bam_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,47 @@ struct sam_file_read<seqan3::format_bam> : public sam_file_data
'\x66', '\x66', '\x66', '\x46', '\x40', '\x7A', '\x7A', '\x5A', '\x73', '\x74', '\x72', '\x00'
};

std::string many_refs{[] ()
{
// X = non-printable character
std::array<char, 2> buffer;
// B A M X ----------- 1345 ----------- @ H D
std::string result{'\x42', '\x41', '\x4D', '\x01', '\x41', '\x05', '\x00', '\x00', '\x40', '\x48', '\x44',
// \t V N : 1 . 6 \n
'\x09', '\x56', '\x4E', '\x3A', '\x31', '\x2E', '\x36', '\x0A'};
for (char i = '0'; i <= '9'; ++i)
// @ S Q \t S N : r e f
result += std::string{'\x40', '\x53', '\x51', '\x09', '\x53', '\x4E', '\x3A', '\x72', '\x65', '\x66',
// _ \t L N : 1 0 0 \n
'\x5F', i, '\x09', '\x4C', '\x4E', '\x3A', '\x31', '\x30', '\x30', '\x0A'};
for (size_t i = 10; i < 64; ++i)
{
std::to_chars(buffer.data(), buffer.data() + buffer.size(), i);
// @ S Q \t S N : r e f
result += std::string{'\x40', '\x53', '\x51', '\x09', '\x53', '\x4E', '\x3A', '\x72', '\x65', '\x66',
// _ \t L N : 1 0 0
'\x5F', buffer[0], buffer[1], '\x09', '\x4C', '\x4E', '\x3A', '\x31', '\x30', '\x30',
// \n
'\x0A'};
}
// ------------ 64 ------------
result += std::string{'\x40', '\x00', '\x00', '\x00'};
for (char i = '0'; i <= '9'; ++i)
// X X X X r e f _ X d
result += std::string{'\x06', '\x00', '\x00', '\x00', '\x72', '\x65', '\x66', '\x5F', i, '\x00', '\x64',
// X X X
'\x00', '\x00', '\x00'};
for (size_t i = 10; i < 64; ++i)
{
std::to_chars(buffer.data(), buffer.data() + buffer.size(), i);
// X X X X r e f _
result += std::string{'\x07', '\x00', '\x00', '\x00', '\x72', '\x65', '\x66', '\x5F', buffer[0], buffer[1],
// X d X X X
'\x00', '\x64', '\x00', '\x00', '\x00'};
}
return result;
}()};

/* bytes were modified to a ref id of 8448: \x00 \x00 \x21 \x00*/
std::string unknown_ref_header{
'\x42', '\x41', '\x4D', '\x01', '\x1C', '\x00', '\x00', '\x00', '\x40', '\x48', '\x44', '\x09', '\x56',
Expand Down
8 changes: 8 additions & 0 deletions test/unit/io/sam_file/format_sam_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./

std::string unknown_ref_header{"@HD\tVN:1.6\n@SQ\tSN:ref\tLN:34\n*\t0\tunknown_ref\t1\t0\t4M\t*\t0\t0\tAAAA\t*\n"};

std::string many_refs{[] ()
{
std::string result{"@HD\tVN:1.6\n"};
for (size_t i = 0; i < 64; ++i)
result += "@SQ\tSN:ref_" + std::to_string(i) + "\tLN:100\n";
return result;
}()};

// -----------------------------------------------------------------------------------------------------------------
// formatted output
// -----------------------------------------------------------------------------------------------------------------
Expand Down
14 changes: 13 additions & 1 deletion test/unit/io/sam_file/sam_file_format_test_template.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,17 @@ TYPED_TEST_P(sam_file_read, format_error_uneven_hexadecimal_tag)
EXPECT_THROW((fin.begin()), seqan3::format_error);
}

// https://github.com/seqan/seqan3/pull/2423
TYPED_TEST_P(sam_file_read, issue2423)
{
typename TestFixture::stream_type istream{this->many_refs};
seqan3::sam_file_input fin{istream, TypeParam{}};
ASSERT_NO_THROW(fin.begin());

EXPECT_EQ(fin.header().ref_id_info.size(), 64u);
EXPECT_EQ(fin.header().ref_dict.size(), 64u);
}

// ----------------------------------------------------------------------------
// sam_file_write
// ----------------------------------------------------------------------------
Expand Down Expand Up @@ -727,7 +738,8 @@ REGISTER_TYPED_TEST_SUITE_P(sam_file_read,
read_mate_but_not_ref_id_without_ref,
cigar_vector,
format_error_ref_id_not_in_reference_information,
format_error_uneven_hexadecimal_tag);
format_error_uneven_hexadecimal_tag,
issue2423);

REGISTER_TYPED_TEST_SUITE_P(sam_file_write,
write_empty_members,
Expand Down

0 comments on commit 7d0230f

Please sign in to comment.