Skip to content

Commit

Permalink
[FEAT] option to not ignore blanks before the id in FASTA files
Browse files Browse the repository at this point in the history
  • Loading branch information
SGSSGene committed Mar 29, 2022
1 parent 576322c commit d5947f8
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 19 deletions.
61 changes: 42 additions & 19 deletions include/seqan3/io/sequence_file/format_fasta.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,11 @@ class format_fasta
throw parse_error{std::string{"Expected to be on beginning of ID, but "} + is_id.msg +
" evaluated to false on " + detail::make_printable(*begin(stream_view))};

// read id
if constexpr (!detail::decays_to_ignore_v<id_type>)
if constexpr (detail::decays_to_ignore_v<id_type>) // Skip the ID, it is not requested by the user.
{
detail::consume(stream_view | detail::take_line_or_throw);
}
else // read ID
{
if (options.truncate_ids)
{
Expand All @@ -196,8 +199,11 @@ class format_fasta
auto e = stream_view.end();
++it; // already checked `is_id`

for (; (it != e) && (is_blank)(*it); ++it)
{}
if (options.fasta_ignore_blanks_before_id)
{
for (; (it != e) && (is_blank)(*it); ++it) // skip leading ' '
{}
}

bool at_delimiter = false;
for (; it != e; ++it)
Expand All @@ -217,27 +223,39 @@ class format_fasta
{}

#else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓

if (options.fasta_ignore_blanks_before_id)
{
std::ranges::copy(stream_view | std::views::drop(1) // skip leading '>' or ';'
| std::views::drop_while(is_blank) // skip leading ' '
| detail::take_until_or_throw(is_cntrl || is_blank) // read ID until delimiter…
| views::char_to<std::ranges::range_value_t<id_type>>,
std::cpp20::back_inserter(id)); // … ^A is old delimiter
}
else
{
std::ranges::copy(stream_view | std::views::drop(1) // skip leading '>' or ';'
| std::views::drop_while(is_blank) // skip leading ' '
| detail::take_until_or_throw(is_cntrl || is_blank) // read ID until delimiter…
| views::char_to<std::ranges::range_value_t<id_type>>,
std::cpp20::back_inserter(id)); // … ^A is old delimiter
}

// consume rest of line
detail::consume(stream_view | detail::take_line_or_throw);
#endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE

}
else
else // !options.truncate_ids
{
#if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
auto it = stream_view.begin();
auto e = stream_view.end();
++it; // skip leading '>' or ';'

for (; (it != e) && (is_blank)(*it); ++it) // skip leading ' '
{}
if (options.fasta_ignore_blanks_before_id)
{
for (; (it != e) && (is_blank)(*it); ++it) // skip leading ' '
{}
}

bool at_delimiter = false;
for (; it != e; ++it)
Expand All @@ -254,19 +272,24 @@ class format_fasta
throw unexpected_end_of_input{"FASTA ID line did not end in newline."};

#else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓

std::ranges::copy(stream_view | detail::take_line_or_throw // read line
| std::views::drop(1) // skip leading '>' or ';'
| std::views::drop_while(is_blank) // skip leading ' '
| views::char_to<std::ranges::range_value_t<id_type>>,
std::cpp20::back_inserter(id));
if (options.fasta_ignore_blanks_before_id)
{
std::ranges::copy(stream_view | detail::take_line_or_throw // read line
| std::views::drop(1) // skip leading '>' or ';'
| std::views::drop_while(is_blank) // skip leading ' '
| views::char_to<std::ranges::range_value_t<id_type>>,
std::cpp20::back_inserter(id));
}
else
{
std::ranges::copy(stream_view | detail::take_line_or_throw // read line
| std::views::drop(1) // skip leading '>' or ';'
| views::char_to<std::ranges::range_value_t<id_type>>,
std::cpp20::back_inserter(id));
}
#endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
}
}
else
{
detail::consume(stream_view | detail::take_line_or_throw);
}
}

//!\brief Implementation of reading the sequence.
Expand Down
2 changes: 2 additions & 0 deletions include/seqan3/io/sequence_file/input_options.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ struct sequence_file_input_options
bool truncate_ids = false;
//!\brief Read the complete_header into the seqan3::field::id for embl or genbank format.
bool embl_genbank_complete_header = false;
//!\brief Skip blank characters between the leading ">" (or ";") and the actual ID in FASTA files.
bool fasta_ignore_blanks_before_id = true;
};

} // namespace seqan3

0 comments on commit d5947f8

Please sign in to comment.