diff --git a/include/compare.h b/include/compare.h index f4f17af..bdd1757 100644 --- a/include/compare.h +++ b/include/compare.h @@ -18,7 +18,7 @@ inline constexpr static uint64_t adjust_seed(uint8_t const kmer_size, uint64_t c return seed >> (64u - 2u * kmer_size); } -enum methods {kmer = 0, minimiser, strobemer}; +enum methods {kmer = 0, minimiser, modmers, strobemer}; struct minimiser_arguments { diff --git a/include/minimiser_distance.hpp b/include/minimiser_distance.hpp new file mode 100644 index 0000000..c3651aa --- /dev/null +++ b/include/minimiser_distance.hpp @@ -0,0 +1,549 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides seqan3::views::minimiser_distance. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace seqan3::detail +{ +// --------------------------------------------------------------------------------------------------------------------- +// minimiser_distance_view class +// --------------------------------------------------------------------------------------------------------------------- + +/*!\brief The type returned by seqan3::views::minimiser_distance. + * \tparam urng1_t The type of the underlying range, must model std::ranges::forward_range, the reference type must + * model std::totally_ordered. The typical use case is that the reference type is the result of + * seqan3::kmer_hash. + * \tparam urng2_t The type of the second underlying range, must model std::ranges::forward_range, the reference type + * must model std::totally_ordered. If only one range is provided this defaults to + * std::ranges::empty_view. + * \implements std::ranges::view + * \ingroup search_views + * + * \details + * + * See seqan3::views::minimiser_distance for a detailed explanation on minimizers. + * + * \note Most members of this class are generated by std::ranges::view_interface which is not yet documented here. + * + * \sa seqan3::views::minimiser_distance + */ +template > +class minimiser_distance_view : public std::ranges::view_interface> +{ +private: + static_assert(std::ranges::forward_range, "The minimiser_distance_view only works on forward_ranges."); + static_assert(std::ranges::forward_range, "The minimiser_distance_view only works on forward_ranges."); + static_assert(std::totally_ordered>, + "The reference type of the underlying range must model std::totally_ordered."); + + //!\brief The default argument of the second range. + using default_urng2_t = std::ranges::empty_view; + + //!\brief Boolean variable, which is true, when second range is not of empty type. + static constexpr bool second_range_is_given = !std::same_as; + + static_assert(!second_range_is_given || std::totally_ordered_with, + std::ranges::range_reference_t>, + "The reference types of the underlying ranges must model std::totally_ordered_with."); + + //!\brief Whether the given ranges are const_iterable + static constexpr bool const_iterable = seqan3::const_iterable_range && + seqan3::const_iterable_range; + + //!\brief The first underlying range. + urng1_t urange1{}; + //!\brief The second underlying range. + urng2_t urange2{}; + + //!\brief The number of values in one window. + size_t window_size{}; + + template + class basic_iterator; + + //!\brief The sentinel type of the minimiser_distance_view. + using sentinel = std::default_sentinel_t; + +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + minimiser_distance_view() = default; //!< Defaulted. + minimiser_distance_view(minimiser_distance_view const & rhs) = default; //!< Defaulted. + minimiser_distance_view(minimiser_distance_view && rhs) = default; //!< Defaulted. + minimiser_distance_view & operator=(minimiser_distance_view const & rhs) = default; //!< Defaulted. + minimiser_distance_view & operator=(minimiser_distance_view && rhs) = default; //!< Defaulted. + ~minimiser_distance_view() = default; //!< Defaulted. + + /*!\brief Construct from a view and a given number of values in one window. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + minimiser_distance_view(urng1_t urange1, size_t const window_size) : + minimiser_distance_view{std::move(urange1), default_urng2_t{}, window_size} + {} + + /*!\brief Construct from a non-view that can be view-wrapped and a given number of values in one window. + * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng1_t. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + template + //!\cond + requires (std::ranges::viewable_range && + std::constructible_from>>) + //!\endcond + minimiser_distance_view(other_urng1_t && urange1, size_t const window_size) : + urange1{std::views::all(std::forward(urange1))}, + urange2{default_urng2_t{}}, + window_size{window_size} + {} + + /*!\brief Construct from two views and a given number of values in one window. + * \param[in] urange1 The first input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + minimiser_distance_view(urng1_t urange1, urng2_t urange2, size_t const window_size) : + urange1{std::move(urange1)}, + urange2{std::move(urange2)}, + window_size{window_size} + { + if constexpr (second_range_is_given) + { + if (std::ranges::distance(urange1) != std::ranges::distance(urange2)) + throw std::invalid_argument{"The two ranges do not have the same size."}; + } + } + + /*!\brief Construct from two non-views that can be view-wrapped and a given number of values in one window. + * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng1_t. + * \tparam other_urng2_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng2_t. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] urange2 The second input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + */ + template + //!\cond + requires (std::ranges::viewable_range && + std::constructible_from> && + std::ranges::viewable_range && + std::constructible_from>) + //!\endcond + minimiser_distance_view(other_urng1_t && urange1, other_urng2_t && urange2, size_t const window_size) : + urange1{std::views::all(std::forward(urange1))}, + urange2{std::views::all(std::forward(urange2))}, + window_size{window_size} + { + if constexpr (second_range_is_given) + { + if (std::ranges::distance(urange1) != std::ranges::distance(urange2)) + throw std::invalid_argument{"The two ranges do not have the same size."}; + } + } + //!\} + + /*!\name Iterators + * \{ + */ + /*!\brief Returns an iterator to the first element of the range. + * \returns Iterator to the first element. + * + * \details + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * Strong exception guarantee. + */ + basic_iterator begin() + { + return {std::ranges::begin(urange1), + std::ranges::end(urange1), + std::ranges::begin(urange2), + window_size}; + } + + //!\copydoc begin() + basic_iterator begin() const + //!\cond + requires const_iterable + //!\endcond + { + return {std::ranges::cbegin(urange1), + std::ranges::cend(urange1), + std::ranges::cbegin(urange2), + window_size}; + } + + /*!\brief Returns an iterator to the element following the last element of the range. + * \returns Iterator to the end. + * + * \details + * + * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + sentinel end() const + { + return {}; + } + //!\} +}; + +//!\brief Iterator for calculating minimiser_distances. +template +template +class minimiser_distance_view::basic_iterator +{ +private: + //!\brief The sentinel type of the first underlying range. + using urng1_sentinel_t = maybe_const_sentinel_t; + //!\brief The iterator type of the first underlying range. + using urng1_iterator_t = maybe_const_iterator_t; + //!\brief The iterator type of the second underlying range. + using urng2_iterator_t = maybe_const_iterator_t; + + template + friend class basic_iterator; + +public: + /*!\name Associated types + * \{ + */ + //!\brief Type for distances between iterators. + using difference_type = std::ranges::range_difference_t; + //!\brief Value type of this iterator. + using value_type = std::ranges::range_value_t; + //!\brief The pointer type. + using pointer = void; + //!\brief Reference to `value_type`. + using reference = value_type; + //!\brief Tag this class as a forward iterator. + using iterator_category = std::forward_iterator_tag; + //!\brief Tag this class as a forward iterator. + using iterator_concept = iterator_category; + //!\} + + /*!\name Constructors, destructor and assignment + * \{ + */ + basic_iterator() = default; //!< Defaulted. + basic_iterator(basic_iterator const &) = default; //!< Defaulted. + basic_iterator(basic_iterator &&) = default; //!< Defaulted. + basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted. + basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted. + ~basic_iterator() = default; //!< Defaulted. + + //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range. + basic_iterator(basic_iterator const & it) + //!\cond + requires const_range + //!\endcond + : minimiser_distance_value{std::move(it.minimiser_distance_value)}, + urng1_iterator{std::move(it.urng1_iterator)}, + urng1_sentinel{std::move(it.urng1_sentinel)}, + urng2_iterator{std::move(it.urng2_iterator)}, + window_values{std::move(it.window_values)} + {} + + /*!\brief Construct from begin and end iterators of a given range over std::totally_ordered values, and the number + of values per window. + * \param[in] urng1_iterator Iterator pointing to the first position of the first std::totally_ordered range. + * \param[in] urng1_sentinel Iterator pointing to the last position of the first std::totally_ordered range. + * \param[in] urng2_iterator Iterator pointing to the first position of the second std::totally_ordered range. + * \param[in] window_size The number of values in one window. + * + * \details + * + * Looks at the number of values per window in two ranges, returns the smallest between both as minimiser_distance and + * shifts then by one to repeat this action. If a minimiser_distance in consecutive windows is the same, it is returned only + * once. + */ + basic_iterator(urng1_iterator_t urng1_iterator, + urng1_sentinel_t urng1_sentinel, + urng2_iterator_t urng2_iterator, + size_t window_size) : + urng1_iterator{std::move(urng1_iterator)}, + urng1_sentinel{std::move(urng1_sentinel)}, + urng2_iterator{std::move(urng2_iterator)} + { + size_t size = std::ranges::distance(urng1_iterator, urng1_sentinel); + window_size = std::min(window_size, size); + + window_first(window_size); + } + //!\} + + //!\anchor basic_iterator_comparison + //!\name Comparison operators + //!\{ + + //!\brief Compare to another basic_iterator. + friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) + { + return (lhs.urng1_iterator == rhs.urng1_iterator) && + (rhs.urng2_iterator == rhs.urng2_iterator) && + (lhs.window_values.size() == rhs.window_values.size()); + } + + //!\brief Compare to another basic_iterator. + friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs) + { + return !(lhs == rhs); + } + + //!\brief Compare to the sentinel of the minimiser_distance_view. + friend bool operator==(basic_iterator const & lhs, sentinel const &) + { + return lhs.urng1_iterator == lhs.urng1_sentinel; + } + + //!\brief Compare to the sentinel of the minimiser_distance_view. + friend bool operator==(sentinel const & lhs, basic_iterator const & rhs) + { + return rhs == lhs; + } + + //!\brief Compare to the sentinel of the minimiser_distance_view. + friend bool operator!=(sentinel const & lhs, basic_iterator const & rhs) + { + return !(lhs == rhs); + } + + //!\brief Compare to the sentinel of the minimiser_distance_view. + friend bool operator!=(basic_iterator const & lhs, sentinel const & rhs) + { + return !(lhs == rhs); + } + //!\} + + //!\brief Pre-increment. + basic_iterator & operator++() noexcept + { + next_unique_minimiser_distance(); + return *this; + } + + //!\brief Post-increment. + basic_iterator operator++(int) noexcept + { + basic_iterator tmp{*this}; + next_unique_minimiser_distance(); + return tmp; + } + + //!\brief Return the minimiser_distance. + value_type operator*() const noexcept + { + return minimiser_distance_value; + } + +private: + //!\brief The minimiser_distance value. + value_type minimiser_distance_value{}; + //!\brief The minimiser value. + value_type minimiser_value{}; + //!\brief Helper to track distance. + size_t distance{}; + + //!\brief The offset relative to the beginning of the window where the minimizer value is found. + size_t minimiser_distance_position_offset{}; + + //!\brief Iterator to the rightmost value of one window. + urng1_iterator_t urng1_iterator{}; + //!brief Iterator to last element in range. + urng1_sentinel_t urng1_sentinel{}; + //!\brief Iterator to the rightmost value of one window of the second range. + urng2_iterator_t urng2_iterator{}; + + //!\brief Stored values per window. It is necessary to store them, because a shift can remove the current minimiser_distance. + std::deque window_values{}; + + //!\brief Increments iterator by 1. + void next_unique_minimiser_distance() + { + while (!next_minimiser_distance()) {} + } + + //!\brief Returns new window value. + auto window_value() const + { + if constexpr (!second_range_is_given) + return *urng1_iterator; + else + return std::min(*urng1_iterator, *urng2_iterator); + } + + //!\brief Advances the window to the next position. + void advance_window() + { + ++urng1_iterator; + if constexpr (second_range_is_given) + ++urng2_iterator; + } + + //!\brief Calculates minimiser_distances for the first window. + void window_first(size_t const window_size) + { + if (window_size == 0u) + return; + + for (size_t i = 0u; i < window_size - 1u; ++i) + { + window_values.push_back(window_value()); + advance_window(); + } + window_values.push_back(window_value()); + auto minimiser_distance_it = std::ranges::min_element(window_values, std::less_equal{}); + minimiser_value = *minimiser_distance_it; + minimiser_distance_value = std::distance(std::begin(window_values), minimiser_distance_it); + minimiser_distance_position_offset = std::distance(std::begin(window_values), minimiser_distance_it); + distance = window_values.size() - minimiser_distance_value - 1; + } + + /*!\brief Calculates the next minimiser_distance value. + * \returns True, if new minimiser_distance is found or end is reached. Otherwise returns false. + * \details + * For the following windows, we remove the first window value (is now not in window_values) and add the new + * value that results from the window shifting. + */ + bool next_minimiser_distance() + { + advance_window(); + if (urng1_iterator == urng1_sentinel) + return true; + + value_type const new_value = window_value(); + + window_values.pop_front(); + window_values.push_back(new_value); + + if (minimiser_distance_position_offset == 0) + { + auto minimiser_distance_it = std::ranges::min_element(window_values, std::less_equal{}); + minimiser_distance_value = std::distance(std::begin(window_values), minimiser_distance_it); + minimiser_distance_position_offset = std::distance(std::begin(window_values), minimiser_distance_it); + minimiser_value = *minimiser_distance_it; + distance = window_values.size() - minimiser_distance_value - 1; + return true; + } + + if (new_value < minimiser_value) + { + minimiser_distance_value = distance; + distance = 0; + minimiser_distance_position_offset = window_values.size() - 1; + minimiser_value = new_value; + return true; + } + + distance++; + --minimiser_distance_position_offset; + return false; + } +}; + +//!\brief A deduction guide for the view class template. +template +minimiser_distance_view(rng1_t &&, size_t const window_size) -> minimiser_distance_view>; + +//!\brief A deduction guide for the view class template. +template +minimiser_distance_view(rng1_t &&, rng2_t &&, size_t const window_size) -> minimiser_distance_view, + std::views::all_t>; + +// --------------------------------------------------------------------------------------------------------------------- +// minimiser_distance_fn (adaptor definition) +// --------------------------------------------------------------------------------------------------------------------- + +//![adaptor_def] +//!\brief seqan3::views::minimiser_distance's range adaptor object type (non-closure). +//!\ingroup search_views +struct minimiser_distance_fn +{ + //!\brief Store the number of values in one window and return a range adaptor closure object. + constexpr auto operator()(size_t const window_size) const + { + return adaptor_from_functor{*this, window_size}; + } + + /*!\brief Call the view's constructor with two arguments: the underlying view and an integer indicating how many + * values one window contains. + * \tparam urng1_t The type of the input range to process. Must model std::ranges::viewable_range. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] window_size The number of values in one window. + * \returns A range of converted values. + */ + template + constexpr auto operator()(urng1_t && urange1, size_t const window_size) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::minimiser_distance cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::minimiser_distance must model std::ranges::forward_range."); + + if (window_size == 1) // Would just return urange1 without any changes + throw std::invalid_argument{"The chosen window_size is not valid. " + "Please choose a value greater than 1 or use two ranges."}; + + return minimiser_distance_view{urange1, window_size}; + } +}; +//![adaptor_def] + +} // namespace seqan3::detail + +/*!\brief Computes the distance of minimiser for a range of comparable values. A minimiser_distance is the smallest value in a window. + * \tparam urng_t The type of the first range being processed. [template + * parameter is omitted in pipe notation] + * \param[in] urange1 The range being processed. [parameter is omitted in pipe notation] + * \param[in] window_size The number of values in one window. + * \returns A range of std::totally_ordered where each value is the minimal value for one window. See below for the + * properties of the returned range. + * \ingroup search_views + * + * \details + * + * For more information look into seqan3::views::minimiser. + */ +inline constexpr auto minimiser_distance = seqan3::detail::minimiser_distance_fn{}; diff --git a/include/minimiser_hash_distance.hpp b/include/minimiser_hash_distance.hpp new file mode 100644 index 0000000..3c8ebb5 --- /dev/null +++ b/include/minimiser_hash_distance.hpp @@ -0,0 +1,114 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides seqan3::views::minimiser_distance_hash. + */ + +#pragma once + +#include +#include +#include +#include + +#include "minimiser_distance.hpp" + + +namespace seqan3::detail +{ +//!\brief seqan3::views::minimiser_distance_hash's range adaptor object type (non-closure). +//!\ingroup search_views +struct minimiser_distance_hash_fn +{ + /*!\brief Store the shape and the window size and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] window_size The windows size to use. + * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. + * \returns A range of converted elements. + */ + constexpr auto operator()(shape const & shape, window_size const window_size) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, window_size}; + } + + /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] window_size The size of the window. + * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. + * \returns A range of converted elements. + */ + constexpr auto operator()(shape const & shape, window_size const window_size, seed const seed) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, window_size, seed}; + } + + /*!\brief Call the view's constructor with the underlying view, a seqan3::shape and a window size as argument. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type + * of the range must model seqan3::semialphabet. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] window_size The size of the window. + * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `window_size`. + * \returns A range of converted elements. + */ + template + constexpr auto operator()(urng_t && urange, + shape const & shape, + window_size const window_size, + seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::minimiser_distance_hash cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::minimiser_distance_hash must model std::ranges::forward_range."); + static_assert(semialphabet>, + "The range parameter to views::minimiser_distance_hash must be over elements of seqan3::semialphabet."); + + if (shape.size() > window_size.get()) + throw std::invalid_argument{"The size of the shape cannot be greater than the window size."}; + + auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}); + + auto reverse_strand = std::forward(urange) | seqan3::views::complement + | std::views::reverse + | seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}) + | std::views::reverse; + + return minimiser_distance_view(forward_strand, reverse_strand, window_size.get() - shape.size() + 1); + } +}; + +} // namespace seqan3::detail + + +/*!\name Alphabet related views + * \{ + */ + +/*!\brief Computes the distance of minimisers for a range with a given shape, window size and seed. + * \tparam urng_t The type of the range being processed. + * \param[in] urange The range being processed. [parameter is omitted in pipe notation] + * \param[in] shape The seqan3::shape that determines how to compute the hash value. + * \param[in] window_size The window size to use. + * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE. + * \returns A range of `size_t` where each value is the minimiser of the resp. window. + * See below for the properties of the returned range. + * \ingroup utility_views + * + * \details + * For more information look into seqan3::views::minimiser_hash + */ +inline constexpr auto minimiser_hash_distance = seqan3::detail::minimiser_distance_hash_fn{}; + +//!\} diff --git a/include/modmer.hpp b/include/modmer.hpp new file mode 100644 index 0000000..b4cf1e6 --- /dev/null +++ b/include/modmer.hpp @@ -0,0 +1,443 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides modmer. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace seqan3::detail +{ +// --------------------------------------------------------------------------------------------------------------------- +// modmer_view class +// --------------------------------------------------------------------------------------------------------------------- + +/*!\brief The type returned by modmer. + * \tparam urng1_t The type of the underlying range, must model std::ranges::forward_range, the reference type must + * model std::totally_ordered. The typical use case is that the reference type is the result of + * seqan3::kmer_hash. + * \tparam measure_distance If true, then not the actual modmers are returned, but the distances of the modmers. + * \implements std::ranges::view + * \ingroup search_views + * + * + * \note Most members of this class are generated by std::ranges::view_interface which is not yet documented here. + + */ +template +class modmer_view : public std::ranges::view_interface> +{ +private: + static_assert(std::ranges::forward_range, "The modmer_view only works on forward_ranges."); + static_assert(std::totally_ordered>, + "The reference type of the underlying range must model std::totally_ordered."); + + //!\brief Whether the given ranges are const_iterable + static constexpr bool const_iterable = seqan3::const_iterable_range; + + //!\brief The first underlying range. + urng1_t urange1{}; + + //!\brief The number of values in one window. + size_t mod_used{}; + + template + class basic_iterator; + + //!\brief The sentinel type of the modmer_view. + using sentinel = std::default_sentinel_t; + +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + /// \cond Workaround_Doxygen + modmer_view() requires std::default_initializable = default; //!< Defaulted. + /// \endcond + modmer_view(modmer_view const & rhs) = default; //!< Defaulted. + modmer_view(modmer_view && rhs) = default; //!< Defaulted. + modmer_view & operator=(modmer_view const & rhs) = default; //!< Defaulted. + modmer_view & operator=(modmer_view && rhs) = default; //!< Defaulted. + ~modmer_view() = default; //!< Defaulted. + + /*!\brief Construct from a view and a given number of values in one window. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] mod_used The number of values in one window. + */ + modmer_view(urng1_t urange1, size_t const mod_used) : + urange1{std::move(urange1)}, + mod_used{mod_used} + {} + + /*!\brief Construct from a non-view that can be view-wrapped and a given number of values in one window. + * \tparam other_urng1_t The type of another urange. Must model std::ranges::viewable_range and be constructible + from urng1_t. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] mod_used The number of values in one window. + */ + template + //!\cond + requires (std::ranges::viewable_range && + std::constructible_from>>) + //!\endcond + modmer_view(other_urng1_t && urange1, size_t const mod_used) : + urange1{std::views::all(std::forward(urange1))}, + mod_used{mod_used} + {} + + /*!\name Iterators + * \{ + */ + /*!\brief Returns an iterator to the first element of the range. + * \returns Iterator to the first element. + * + * \details + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * Strong exception guarantee. + */ + basic_iterator begin() + { + return {std::ranges::begin(urange1), + std::ranges::end(urange1), + mod_used}; + } + + //!\copydoc begin() + basic_iterator begin() const + //!\cond + requires const_iterable + //!\endcond + { + return {std::ranges::cbegin(urange1), + std::ranges::cend(urange1), + mod_used}; + } + + /*!\brief Returns an iterator to the element following the last element of the range. + * \returns Iterator to the end. + * + * \details + * + * This element acts as a placeholder; attempting to dereference it results in undefined behaviour. + * + * ### Complexity + * + * Constant. + * + * ### Exceptions + * + * No-throw guarantee. + */ + sentinel end() const + { + return {}; + } + //!\} +}; + +//!\brief Iterator for calculating modmers. +template +template +class modmer_view::basic_iterator +{ +private: + //!\brief The sentinel type of the first underlying range. + using urng1_sentinel_t = maybe_const_sentinel_t; + //!\brief The iterator type of the first underlying range. + using urng1_iterator_t = maybe_const_iterator_t; + + template + friend class basic_iterator; + +public: + /*!\name Associated types + * \{ + */ + //!\brief Type for distances between iterators. + using difference_type = std::ranges::range_difference_t; + //!\brief Value type of this iterator. + using value_type = std::ranges::range_value_t; + //!\brief The pointer type. + using pointer = void; + //!\brief Reference to `value_type`. + using reference = value_type; + //!\brief Tag this class as a forward iterator. + using iterator_category = std::forward_iterator_tag; + //!\brief Tag this class as a forward iterator. + using iterator_concept = iterator_category; + //!\} + + /*!\name Constructors, destructor and assignment + * \{ + */ + basic_iterator() = default; //!< Defaulted. + basic_iterator(basic_iterator const &) = default; //!< Defaulted. + basic_iterator(basic_iterator &&) = default; //!< Defaulted. + basic_iterator & operator=(basic_iterator const &) = default; //!< Defaulted. + basic_iterator & operator=(basic_iterator &&) = default; //!< Defaulted. + ~basic_iterator() = default; //!< Defaulted. + + //!\brief Allow iterator on a const range to be constructible from an iterator over a non-const range. + basic_iterator(basic_iterator const & it) + //!\cond + requires const_range + //!\endcond + : modmer_value{std::move(it.modmer_value)}, + urng1_iterator{std::move(it.urng1_iterator)}, + urng1_sentinel{std::move(it.urng1_sentinel)} + {} + + /*!\brief Construct from begin and end iterators of a given range over std::totally_ordered values, and the number + of values per window. + * \param[in] urng1_iterator Iterator pointing to the first position of the first std::totally_ordered range. + * \param[in] urng1_sentinel Iterator pointing to the last position of the first std::totally_ordered range. + * \param[in] mod_used The number of values in one window. + * + * \details + * + * Looks at the number of values per window in two ranges, returns the smallest between both as modmer and + * shifts then by one to repeat this action. If a modmer in consecutive windows is the same, it is returned only + * once. + */ + basic_iterator(urng1_iterator_t urng1_iterator, + urng1_sentinel_t urng1_sentinel, + size_t mod_used) : + urng1_iterator{std::move(urng1_iterator)}, + urng1_sentinel{std::move(urng1_sentinel)}, + mod{mod_used} + { + size_t size = std::ranges::distance(urng1_iterator, urng1_sentinel); + mod_used = std::min(mod_used, size); + + first_modmer(); + } + //!\} + + //!\anchor basic_iterator_comparison_modmer + //!\name Comparison operators + //!\{ + + //!\brief Compare to another basic_iterator. + friend bool operator==(basic_iterator const & lhs, basic_iterator const & rhs) + { + return (lhs.urng1_iterator == rhs.urng1_iterator); + } + + //!\brief Compare to another basic_iterator. + friend bool operator!=(basic_iterator const & lhs, basic_iterator const & rhs) + { + return !(lhs == rhs); + } + + //!\brief Compare to the sentinel of the modmer_view. + friend bool operator==(basic_iterator const & lhs, sentinel const &) + { + return lhs.urng1_iterator == lhs.urng1_sentinel; + } + + //!\brief Compare to the sentinel of the modmer_view. + friend bool operator==(sentinel const & lhs, basic_iterator const & rhs) + { + return rhs == lhs; + } + + //!\brief Compare to the sentinel of the modmer_view. + friend bool operator!=(sentinel const & lhs, basic_iterator const & rhs) + { + return !(lhs == rhs); + } + + //!\brief Compare to the sentinel of the modmer_view. + friend bool operator!=(basic_iterator const & lhs, sentinel const & rhs) + { + return !(lhs == rhs); + } + //!\} + + //!\brief Pre-increment. + basic_iterator & operator++() noexcept + { + next_unique_modmer(); + return *this; + } + + //!\brief Post-increment. + basic_iterator operator++(int) noexcept + { + basic_iterator tmp{*this}; + next_unique_modmer(); + return tmp; + } + + //!\brief Return the modmer. + value_type operator*() const noexcept + { + return modmer_value; + } + +private: + //!\brief The modmer value. + value_type modmer_value{}; + + //!\brief Iterator to the rightmost value of one window. + urng1_iterator_t urng1_iterator{}; + //!brief Iterator to last element in range. + urng1_sentinel_t urng1_sentinel{}; + + //!brief The mod value used. + size_t mod{}; + //!brief The distance stored. Only relevant, if measure_distance is true. + size_t distance{1}; + + //!\brief Advances the window to the next position. + void advance() + { + distance++; + ++urng1_iterator; + } + + void first_modmer() + { + if (!next_modmer()) + next_unique_modmer(); + } + + //!\brief Increments iterator by 1. + void next_unique_modmer() + { + while (1) + { + advance(); + if (next_modmer()) + break; + } + } + + /*!\brief Calculates the next modmer value. + * \returns True, if new modmer is found or end is reached. Otherwise returns false. + */ + bool next_modmer() + { + if (urng1_iterator == urng1_sentinel) + return true; + + if (*urng1_iterator % mod == 0) + { + if constexpr (measure_distance) + { + modmer_value = distance - 1; + distance = 0; + } + else + { + modmer_value = *urng1_iterator; + } + return true; + } + + return false; + } +}; + +//!\brief A deduction guide for the view class template. +template +modmer_view(rng1_t &&, size_t const mod_used) -> modmer_view>; + +template +modmer_view(rng1_t &&, size_t const mod_used) -> modmer_view, m1>; + + +// --------------------------------------------------------------------------------------------------------------------- +// modmer_fn (adaptor definition) +// --------------------------------------------------------------------------------------------------------------------- + +//![adaptor_def] +//!\brief modmer's range adaptor object type (non-closure). +//!\ingroup search_views +struct modmer_fn +{ + //!\brief Store the number of values in one window and return a range adaptor closure object. + constexpr auto operator()(size_t const mod_used) const + { + return adaptor_from_functor{*this, mod_used}; + } + + /*!\brief Call the view's constructor with two arguments: the underlying view and an integer indicating how many + * values one window contains. + * \tparam urng1_t The type of the input range to process. Must model std::ranges::viewable_range. + * \param[in] urange1 The input range to process. Must model std::ranges::viewable_range and + * std::ranges::forward_range. + * \param[in] mod_used The number of values in one window. + * \returns A range of converted values. + */ + template + constexpr auto operator()(urng1_t && urange1, size_t const mod_used) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::modmer cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::modmer must model std::ranges::forward_range."); + + return modmer_view{urange1, mod_used}; + } +}; +//![adaptor_def] + +} // namespace seqan3::detail + +/*!\brief Computes modmers for a range of comparable values. A modmer is a value that fullfills the + condition value % mod_used. + * \tparam urng_t The type of the first range being processed. See below for requirements. [template + * parameter is omitted in pipe notation] + * \param[in] urange1 The range being processed. [parameter is omitted in pipe notation] + * \param[in] mod_used The mod value used. + * \returns A range of std::totally_ordered where each value is ... See below for the + * properties of the returned range. + * \ingroup search_views + * + * + * ### View properties + * + * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | + * |----------------------------------|:----------------------------------:|:--------------------------------:| + * | std::ranges::input_range | *required* | *preserved* | + * | std::ranges::forward_range | *required* | *preserved* | + * | std::ranges::bidirectional_range | | *lost* | + * | std::ranges::random_access_range | | *lost* | + * | std::ranges::contiguous_range | | *lost* | + * | | | | + * | std::ranges::viewable_range | *required* | *guaranteed* | + * | std::ranges::view | | *guaranteed* | + * | std::ranges::sized_range | | *lost* | + * | std::ranges::common_range | | *lost* | + * | std::ranges::output_range | | *lost* | + * | seqan3::const_iterable_range | | *preserved* | + * | | | | + * | std::ranges::range_reference_t | std::totally_ordered | std::totally_ordered | + * + * See the views views submodule documentation for detailed descriptions of the view properties. + */ +inline constexpr auto modmer = seqan3::detail::modmer_fn{}; diff --git a/include/modmer_hash.hpp b/include/modmer_hash.hpp new file mode 100644 index 0000000..a17114b --- /dev/null +++ b/include/modmer_hash.hpp @@ -0,0 +1,142 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides modmer_hash. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "modmer.hpp" +#include "shared.hpp" + +namespace seqan3::detail +{ +//!\brief seqan3::views::modmer_hash's range adaptor object type (non-closure). +//!\ingroup search_views +struct modmer_hash_fn +{ + /*!\brief Store the shape and the window size and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] mod_used The mod value to use. + * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. + * \returns A range of converted elements. + */ + constexpr auto operator()(shape const & shape, uint32_t const mod_used) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, mod_used}; + } + + /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] mod_used The mod value to use. + * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. + * \returns A range of converted elements. + */ + constexpr auto operator()(shape const & shape, uint32_t const mod_used, seed const seed) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, mod_used, seed}; + } + + /*!\brief Call the view's constructor with the underlying view, a seqan3::shape and a window size as argument. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type + * of the range must model seqan3::semialphabet. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] mod_used The mod value to use. + * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. + * \returns A range of converted elements. + */ + template + constexpr auto operator()(urng_t && urange, + shape const & shape, + uint32_t const mod_used, + seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::modmer_hash cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::modmer_hash must model std::ranges::forward_range."); + static_assert(semialphabet>, + "The range parameter to views::modmer_hash must be over elements of seqan3::semialphabet."); + + if (mod_used == 1) // Would just return urange1 without any changes + throw std::invalid_argument{"The chosen mod_used is not valid. " + "Please choose a value greater than 1."}; + + auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}); + + auto reverse_strand = std::forward(urange) | seqan3::views::complement + | std::views::reverse + | seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}) + | std::views::reverse; + // fnv_hash ensures actual randomness. + auto combined_strand = seqan3::views::zip(forward_strand, reverse_strand) | std::views::transform([seed](std::tuple i){return fnv_hash(std::get<0>(i) + std::get<1>(i), seed.get());}); + return seqan3::detail::modmer_view(combined_strand, mod_used); + } +}; + +} // namespace seqan3::detail + +/*!\name Alphabet related views + * \{ + */ + +/*!\brief Computes modmers for a range with a given shape, mod_used and seed. + * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is + * omitted in pipe notation] + * \param[in] urange The range being processed. [parameter is omitted in pipe notation] + * \param[in] shape The seqan3::shape that determines how to compute the hash value. + * \param[in] mod_used The mod value to use. + * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE. + * \returns A range of `size_t` where each value is the modmer of the resp. window. + * See below for the properties of the returned range. + * \ingroup search_views + * + * \attention + * Be aware of the requirements of the seqan3::views::kmer_hash view. + * + * + * ### View properties + * + * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | + * |----------------------------------|:----------------------------------:|:--------------------------------:| + * | std::ranges::input_range | *required* | *preserved* | + * | std::ranges::forward_range | *required* | *preserved* | + * | std::ranges::bidirectional_range | | *lost* | + * | std::ranges::random_access_range | | *lost* | + * | std::ranges::contiguous_range | | *lost* | + * | | | | + * | std::ranges::viewable_range | *required* | *guaranteed* | + * | std::ranges::view | | *guaranteed* | + * | std::ranges::sized_range | | *lost* | + * | std::ranges::common_range | | *lost* | + * | std::ranges::output_range | | *lost* | + * | seqan3::const_iterable_range | | *preserved* | + * | | | | + * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t | + * + * See the views views submodule documentation for detailed descriptions of the view properties. + * + * \hideinitializer + * + */ +inline constexpr auto modmer_hash = seqan3::detail::modmer_hash_fn{}; + +//!\} diff --git a/include/modmer_hash_distance.hpp b/include/modmer_hash_distance.hpp new file mode 100644 index 0000000..c448ba3 --- /dev/null +++ b/include/modmer_hash_distance.hpp @@ -0,0 +1,142 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \author Mitra Darvish + * \brief Provides modmer_hash. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "modmer.hpp" +#include "shared.hpp" + +namespace seqan3::detail +{ +//!\brief seqan3::views::modmer_hash's range adaptor object type (non-closure). +//!\ingroup search_views +struct modmer_hash_distance_fn +{ + /*!\brief Store the shape and the window size and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] mod_used The mod value to use. + * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. + * \returns A range of converted elements. + */ + constexpr auto operator()(shape const & shape, uint32_t const mod_used) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, mod_used}; + } + + /*!\brief Store the shape, the window size and the seed and return a range adaptor closure object. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] mod_used The mod value to use. + * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. + * \returns A range of converted elements. + */ + constexpr auto operator()(shape const & shape, uint32_t const mod_used, seed const seed) const + { + return seqan3::detail::adaptor_from_functor{*this, shape, mod_used, seed}; + } + + /*!\brief Call the view's constructor with the underlying view, a seqan3::shape and a window size as argument. + * \param[in] urange The input range to process. Must model std::ranges::viewable_range and the reference type + * of the range must model seqan3::semialphabet. + * \param[in] shape The seqan3::shape to use for hashing. + * \param[in] mod_used The mod value to use. + * \param[in] seed The seed to use. + * \throws std::invalid_argument if the size of the shape is greater than the `mod_used`. + * \returns A range of converted elements. + */ + template + constexpr auto operator()(urng_t && urange, + shape const & shape, + uint32_t const mod_used, + seed const seed = seqan3::seed{0x8F3F73B5CF1C9ADE}) const + { + static_assert(std::ranges::viewable_range, + "The range parameter to views::modmer_hash cannot be a temporary of a non-view range."); + static_assert(std::ranges::forward_range, + "The range parameter to views::modmer_hash must model std::ranges::forward_range."); + static_assert(semialphabet>, + "The range parameter to views::modmer_hash must be over elements of seqan3::semialphabet."); + + if (mod_used == 1) // Would just return urange1 without any changes + throw std::invalid_argument{"The chosen mod_used is not valid. " + "Please choose a value greater than 1."}; + + auto forward_strand = std::forward(urange) | seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}); + + auto reverse_strand = std::forward(urange) | seqan3::views::complement + | std::views::reverse + | seqan3::views::kmer_hash(shape) + | std::views::transform([seed] (uint64_t i) + {return i ^ seed.get();}) + | std::views::reverse; + + auto combined_strand = seqan3::views::zip(forward_strand, reverse_strand) | std::views::transform([seed](std::tuple i){return fnv_hash(std::get<0>(i) + std::get<1>(i), seed.get());}); + return seqan3::detail::modmer_view(combined_strand, mod_used); + } +}; + +} // namespace seqan3::detail + +/*!\name Alphabet related views + * \{ + */ + +/*!\brief Computes the distance of modmers for a range with a given shape, mod_used and seed. + * \tparam urng_t The type of the range being processed. See below for requirements. [template parameter is + * omitted in pipe notation] + * \param[in] urange The range being processed. [parameter is omitted in pipe notation] + * \param[in] shape The seqan3::shape that determines how to compute the hash value. + * \param[in] mod_used The mod value to use. + * \param[in] seed The seed used to skew the hash values. Default: 0x8F3F73B5CF1C9ADE. + * \returns A range of `size_t` where each value is the modmer of the resp. window. + * See below for the properties of the returned range. + * \ingroup search_views + * + * \attention + * Be aware of the requirements of the seqan3::views::kmer_hash view. + * + * + * ### View properties + * + * | Concepts and traits | `urng_t` (underlying range type) | `rrng_t` (returned range type) | + * |----------------------------------|:----------------------------------:|:--------------------------------:| + * | std::ranges::input_range | *required* | *preserved* | + * | std::ranges::forward_range | *required* | *preserved* | + * | std::ranges::bidirectional_range | | *lost* | + * | std::ranges::random_access_range | | *lost* | + * | std::ranges::contiguous_range | | *lost* | + * | | | | + * | std::ranges::viewable_range | *required* | *guaranteed* | + * | std::ranges::view | | *guaranteed* | + * | std::ranges::sized_range | | *lost* | + * | std::ranges::common_range | | *lost* | + * | std::ranges::output_range | | *lost* | + * | seqan3::const_iterable_range | | *preserved* | + * | | | | + * | std::ranges::range_reference_t | seqan3::semialphabet | std::size_t | + * + * See the views views submodule documentation for detailed descriptions of the view properties. + * + * \hideinitializer + * + */ +inline constexpr auto modmer_hash_distance = seqan3::detail::modmer_hash_distance_fn{}; + +//!\} diff --git a/include/shared.hpp b/include/shared.hpp new file mode 100644 index 0000000..219d54c --- /dev/null +++ b/include/shared.hpp @@ -0,0 +1,29 @@ +#pragma once + +// +/*! \brief Function that ensures random hashes, based on https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + * \param hash_value The hash_value that should be transformed. + * \param seed The seed. + */ +uint64_t fnv_hash(uint64_t hash_value, uint64_t seed) +{ + // If seed is 0, then the hash value is just returned. + if (seed == 0) + return hash_value; + + constexpr static uint64_t default_offset_basis = 0xcbf29ce484222325; + constexpr static uint64_t prime = 0x100000001b3; + + uint64_t hashed = hash_value; + std::ostringstream os; + os << hash_value; + std::string oss = os.str(); + + for (int i = 0; i < oss.size(); i++) + { + hashed = hashed * prime; + hashed= hashed ^ oss[i]; + } + + return hashed; +} diff --git a/lib/seqan3 b/lib/seqan3 index 52f201a..d29786b 160000 --- a/lib/seqan3 +++ b/lib/seqan3 @@ -1 +1 @@ -Subproject commit 52f201a5ef3d806b7348b67afd80374f160451c7 +Subproject commit d29786b61de73f14eed5c83c14ef7e02f038bdb1 diff --git a/src/compare.cpp b/src/compare.cpp index 5477f16..f380bf0 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -7,6 +7,9 @@ #include #include "compare.h" +#include "minimiser_hash_distance.hpp" +#include "modmer_hash.hpp" +#include "modmer_hash_distance.hpp" /*! \brief Calculate mean and variance of given list. * \param results The vector from which mean and varaince should be calculated of. @@ -165,8 +168,8 @@ void compare_cov(std::filesystem::path sequence_file, urng_t kmer_view, urng_t2 std::vector covereage_avg{}; std::ofstream outfile; - seqan3::sequence_file_input> fin{sequence_file}; - for (auto & [seq] : fin) + seqan3::sequence_file_input> fin{sequence_file}; + for (auto & [id, seq] : fin) { auto kmers = seq | kmer_view; auto submers = seq | input_view; @@ -179,6 +182,7 @@ void compare_cov(std::filesystem::path sequence_file, urng_t kmer_view, urng_t2 submers2.push_back(sub); coverage(kmers, submers2, covs, args.shape); + int i{0}; for(auto & elem : covs) { if (elem == 0) @@ -189,11 +193,22 @@ void compare_cov(std::filesystem::path sequence_file, urng_t kmer_view, urng_t2 { if (island > 0) { + if (island >23) + std::cout << i << " " << island << ", " << id<< "\n"; islands.push_back(island); island = 0; } covered++; } + i++; + } + + if (island > 0) + { + if (island >23) + std::cout << i << " " << island << ", " << id<< "\n"; + islands.push_back(island); + island = 0; } covered_percentage.push_back(covered); @@ -239,6 +254,28 @@ void compare_cov(std::filesystem::path sequence_file, urng_t kmer_view, urng_t2 outfile.close(); } +template +void compare_cov2(std::filesystem::path sequence_file, urng_t distance_view, std::string method_name, range_arguments & args) +{ + std::vector coverage{}; + std::vector stdev{}; + std::ofstream outfile; + + seqan3::sequence_file_input> fin{sequence_file}; + for (auto & [seq] : fin) + { + for (auto && hash : seq | distance_view) + coverage.push_back(hash); + } + double mean_coverage, stdev_coverage; + get_mean_and_var(coverage, mean_coverage, stdev_coverage); + + // Store speed and compression + outfile.open(std::string{args.path_out} + method_name + "_coverage.out"); + outfile << "COV\t"<< method_name << "\t" << *std::min_element(coverage.begin(), coverage.end()) << "\t" << mean_coverage << "\t" << stdev_coverage << "\t" << *std::max_element(coverage.begin(), coverage.end()) << "\n"; + outfile.close(); +} + void do_comparison(std::vector sequence_files, range_arguments & args) { switch(args.name) @@ -246,7 +283,10 @@ void do_comparison(std::vector sequence_files, range_argu case kmer: compare(sequence_files, seqan3::views::kmer_hash(args.shape), "kmer_hash_"+std::to_string(args.k_size), args); break; case minimiser: compare(sequence_files, seqan3::views::minimiser_hash(args.shape, - args.w_size), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); + args.w_size, args.seed_se), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); + break; + case modmers: compare(sequence_files, modmer_hash(args.shape, + args.w_size.get(), args.seed_se), "modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); break; case strobemer: std::ranges::empty_view empty{}; if (args.rand & (args.order == 2)) @@ -268,11 +308,11 @@ void do_coverage(std::filesystem::path sequence_file, range_arguments & args) { switch(args.name) { - case kmer: compare_cov(sequence_file, seqan3::views::kmer_hash(args.shape), seqan3::views::kmer_hash(args.shape), "kmer_hash_"+std::to_string(args.k_size), args); - break; - case minimiser: compare_cov(sequence_file, seqan3::views::minimiser_hash(args.shape, - seqan3::window_size{args.shape.size()}), seqan3::views::minimiser_hash(args.shape, - args.w_size), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); + case minimiser: compare_cov2(sequence_file, minimiser_hash_distance(args.shape, + args.w_size, args.seed_se), "minimiser_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); + break; + case modmers: compare_cov2(sequence_file, modmer_hash_distance(args.shape, + args.w_size.get(), args.seed_se), "modmer_hash_" + std::to_string(args.k_size) + "_" + std::to_string(args.w_size.get()), args); break; } } diff --git a/src/main.cpp b/src/main.cpp index a99b08d..713d8a6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -17,6 +17,8 @@ void string_to_methods(std::string name, methods & m) m = minimiser; else if (name == "strobemer") m = strobemer; + else if (name == "modmer") + m = modmers; }; void read_range_arguments_strobemers(seqan3::argument_parser & parser, range_arguments & args) @@ -55,7 +57,7 @@ int coverage(seqan3::argument_parser & parser) parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size."); std::string method{}; - parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser"}); + parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer"}); read_range_arguments_minimiser(parser, args); @@ -70,16 +72,8 @@ int coverage(seqan3::argument_parser & parser) return -1; } - try - { - string_to_methods(method, args.name); - do_coverage(sequence_file, args); - } - catch (const std::invalid_argument & e) - { - std::cerr << e.what() << std::endl; - return -1; - } + string_to_methods(method, args.name); + do_coverage(sequence_file, args); return 0; } @@ -92,7 +86,7 @@ int speed(seqan3::argument_parser & parser) parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); parser.add_option(args.k_size, 'k', "kmer-size", "Define kmer size."); std::string method{}; - parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "strobemer"}); + parser.add_option(method, '\0', "method", "Pick your method.", seqan3::option_spec::required, seqan3::value_list_validator{"kmer", "minimiser", "modmer", "strobemer"}); read_range_arguments_minimiser(parser, args); read_range_arguments_strobemers(parser, args); @@ -108,16 +102,9 @@ int speed(seqan3::argument_parser & parser) return -1; } - try - { - string_to_methods(method, args.name); - do_comparison(sequence_files, args); - } - catch (const std::invalid_argument & e) - { - std::cerr << e.what() << std::endl; - return -1; - } + string_to_methods(method, args.name); + do_comparison(sequence_files, args); + return 0; } diff --git a/test/api/CMakeLists.txt b/test/api/CMakeLists.txt index f2b634c..73ac436 100644 --- a/test/api/CMakeLists.txt +++ b/test/api/CMakeLists.txt @@ -2,3 +2,9 @@ cmake_minimum_required (VERSION 3.8) add_api_test (comparison_test.cpp) target_use_datasources (comparison_test FILES example1.fasta) + +add_api_test (minimiser_distance_test.cpp) + +add_api_test (modmer_test.cpp) +add_api_test (modmer_hash_test.cpp) +add_api_test (modmer_hash_distance_test.cpp) diff --git a/test/api/comparison_test.cpp b/test/api/comparison_test.cpp index b7c8d03..98a2593 100644 --- a/test/api/comparison_test.cpp +++ b/test/api/comparison_test.cpp @@ -1,6 +1,5 @@ #include -#include #include #include "compare.h" diff --git a/test/api/minimiser_distance_test.cpp b/test/api/minimiser_distance_test.cpp new file mode 100644 index 0000000..9123c93 --- /dev/null +++ b/test/api/minimiser_distance_test.cpp @@ -0,0 +1,245 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../../lib/seqan3/test/unit/range/iterator_test_template.hpp" + +#include "minimiser_hash_distance.hpp" + +using seqan3::operator""_dna4; +using seqan3::operator""_shape; +using result_t = std::vector; + +inline static constexpr auto kmer_view = seqan3::views::kmer_hash(seqan3::ungapped{4}); +inline static constexpr auto rev_kmer_view = seqan3::views::complement | std::views::reverse + | kmer_view + | std::views::reverse; +inline static constexpr auto gapped_kmer_view = seqan3::views::kmer_hash(0b1001_shape); +inline static constexpr auto rev_gapped_kmer_view = seqan3::views::complement | std::views::reverse + | seqan3::views::kmer_hash(0b1001_shape) + | std::views::reverse; +inline static constexpr auto minimiser_view1 = minimiser_distance(1); // kmer_size == window_size +inline static constexpr auto minimiser_no_rev_view = minimiser_distance(5); + +using iterator_type = std::ranges::iterator_t< decltype(std::declval() + | kmer_view + | minimiser_no_rev_view)>; +using two_ranges_iterator_type = std::ranges::iterator_t< decltype(seqan3::detail::minimiser_distance_view{ + std::declval() + | kmer_view, + std::declval() + | rev_kmer_view, + 5})>; + +template <> +struct iterator_fixture : public ::testing::Test +{ + using iterator_tag = std::forward_iterator_tag; + static constexpr bool const_iterable = true; + + seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; + decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; + result_t expected_range{0, 3, 1}; + + decltype(minimiser_distance(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 5)) test_range = + minimiser_distance(vec, 5); +}; + +template <> +struct iterator_fixture : public ::testing::Test +{ + using iterator_tag = std::forward_iterator_tag; + static constexpr bool const_iterable = true; + + seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; + using kmer_hash_view_t = decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})); + + kmer_hash_view_t vec = kmer_view(text); + result_t expected_range{0, 3, 1, 0, 0}; + + using reverse_kmer_hash_view_t = decltype(rev_kmer_view(text)); + + using test_range_t = decltype(seqan3::detail::minimiser_distance_view{kmer_hash_view_t{}, reverse_kmer_hash_view_t{}, 5}); + test_range_t test_range = seqan3::detail::minimiser_distance_view{vec, rev_kmer_view(text), 5}; +}; + +using test_types = ::testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(iterator_fixture, iterator_fixture, test_types, ); + +template +class minimiser_view_properties_test: public ::testing::Test { }; + +using underlying_range_types = ::testing::Types, + std::vector const, + seqan3::bitpacked_sequence, + seqan3::bitpacked_sequence const, + std::list, + std::list const, + std::forward_list, + std::forward_list const>; +TYPED_TEST_SUITE(minimiser_view_properties_test, underlying_range_types, ); + +class minimiser_test : public ::testing::Test +{ +protected: + std::vector text1{"AAAAAAAAAAAAAAAAAAA"_dna4}; + std::vector text1_short{"AAAAAA"_dna4}; + result_t result1{4, 4, 4}; // Same result for ungapped and gapped + result_t result1_short{15}; + + std::vector too_short_text{"AC"_dna4}; + + std::vector text3{"ACGGCGACGTTTAG"_dna4}; + result_t result3_ungapped{0, 3, 1, 0, 0}; // ACGG, CGAC, ACGT, aacg, aaac - lowercase for reverse complement + result_t result3_gapped{0, 4, 0, 0, 0}; // A--G, c--c, A--T, a--g, a--c - "-" for gap + result_t result3_ungapped_no_rev{0, 3, 1}; // ACGG, CGAC, ACGT + result_t result3_gapped_no_rev{0, 3, 1}; // A--G, C--C-, A--T "-" for gap + result_t result3_stop{0, 3}; // For stop at first T + result_t result3_gapped_stop{0, 4}; // A--G, c--c + result_t result3_start{2}; // For start at second A, ungapped and gapped the same + result_t result3_ungapped_no_rev_start{0}; // For start at second A + result_t result3_gapped_no_rev_start{0}; // For start at second A +}; + +template +void compare_types(adaptor_t v) +{ + EXPECT_TRUE(std::ranges::input_range); + EXPECT_TRUE(std::ranges::forward_range); + EXPECT_FALSE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::random_access_range); + EXPECT_TRUE(std::ranges::view); + EXPECT_FALSE(std::ranges::sized_range); + EXPECT_FALSE(std::ranges::common_range); + EXPECT_TRUE(seqan3::const_iterable_range); + EXPECT_FALSE((std::ranges::output_range)); +} + +TYPED_TEST(minimiser_view_properties_test, concepts) +{ + TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, + 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG + + auto v = text | kmer_view | minimiser_no_rev_view; + compare_types(v); + auto v2 = seqan3::detail::minimiser_distance_view{text | kmer_view, text | kmer_view, 5}; + + if constexpr (std::ranges::bidirectional_range) // excludes forward_list + { + auto v3 = seqan3::detail::minimiser_distance_view{text | kmer_view, text | rev_kmer_view, 5}; + compare_types(v3); + } +} + +TYPED_TEST(minimiser_view_properties_test, different_inputs_kmer_hash) +{ + TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, + 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG + result_t ungapped{0, 3, 1, 0, 0}; // ACGT, CGAC, ACGT, aacg, aaac - lowercase for reverse comp. + result_t gapped{0, 4, 0, 0, 0}; // A--T, c--c, A--T, a--g, a--c - "-" for gap + result_t ungapped_no_rev{0, 3, 1}; // ACGT, CGAC, ACGT + result_t gapped_no_rev{0, 3, 1}; // A--T, C--C, A--T - "-" for gap + EXPECT_RANGE_EQ(ungapped_no_rev, text | kmer_view | minimiser_no_rev_view); + EXPECT_RANGE_EQ(gapped_no_rev, text | gapped_kmer_view | minimiser_no_rev_view); + + if constexpr (std::ranges::bidirectional_range) // excludes forward_list + { + EXPECT_RANGE_EQ(ungapped, (seqan3::detail::minimiser_distance_view{text | kmer_view, text | rev_kmer_view, 5})) ; + EXPECT_RANGE_EQ(gapped, (seqan3::detail::minimiser_distance_view{text | gapped_kmer_view, text | rev_gapped_kmer_view, 5})); + } +} + +TEST_F(minimiser_test, ungapped_kmer_hash) +{ + EXPECT_RANGE_EQ(result1, (seqan3::detail::minimiser_distance_view{text1 | kmer_view, text1 | rev_kmer_view, 5})); + EXPECT_RANGE_EQ(result1, text1 | kmer_view | minimiser_no_rev_view); + EXPECT_THROW(text1_short | kmer_view | minimiser_view1, std::invalid_argument); + auto empty_view = seqan3::detail::minimiser_distance_view{too_short_text | kmer_view, too_short_text | rev_kmer_view, 5}; + EXPECT_TRUE(std::ranges::empty(empty_view)); + auto empty_view2 = too_short_text | kmer_view | minimiser_no_rev_view; + EXPECT_TRUE(std::ranges::empty(empty_view2)); + EXPECT_RANGE_EQ(result3_ungapped, (seqan3::detail::minimiser_distance_view{text3 | kmer_view, text3 | rev_kmer_view, 5})); + EXPECT_RANGE_EQ(result3_ungapped_no_rev, text3 | kmer_view | minimiser_no_rev_view); + EXPECT_THROW((text3 | minimiser_hash_distance(seqan3::ungapped{4}, seqan3::window_size{3})), std::invalid_argument); + +} + +TEST_F(minimiser_test, gapped_kmer_hash) +{ + EXPECT_RANGE_EQ(result1, (seqan3::detail::minimiser_distance_view{text1 | gapped_kmer_view, + text1 | rev_gapped_kmer_view, + 5})); + EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | minimiser_no_rev_view); + EXPECT_THROW(text1_short | gapped_kmer_view | minimiser_view1, std::invalid_argument); + auto empty_view = seqan3::detail::minimiser_distance_view{too_short_text | gapped_kmer_view, + too_short_text | rev_gapped_kmer_view, + 5}; + EXPECT_TRUE(std::ranges::empty(empty_view)); + auto empty_view2 = too_short_text | gapped_kmer_view | minimiser_no_rev_view; + EXPECT_TRUE(std::ranges::empty(empty_view2)); + EXPECT_RANGE_EQ(result3_gapped, (seqan3::detail::minimiser_distance_view{text3 | gapped_kmer_view, + text3 | rev_gapped_kmer_view, + 5})); + EXPECT_RANGE_EQ(result3_gapped_no_rev, text3 | gapped_kmer_view | minimiser_no_rev_view); + EXPECT_THROW((text3 | minimiser_hash_distance(0b1001_shape, seqan3::window_size{3})), std::invalid_argument); +} + +TEST_F(minimiser_test, window_too_big) +{ + EXPECT_RANGE_EQ(result1_short, text1 | kmer_view | minimiser_distance(20)); + EXPECT_RANGE_EQ(result1_short, text1 | gapped_kmer_view | minimiser_distance(20)); + EXPECT_RANGE_EQ(result1_short, (seqan3::detail::minimiser_distance_view{text1 | kmer_view, text1 | rev_kmer_view, 20})); + EXPECT_RANGE_EQ(result1_short, (seqan3::detail::minimiser_distance_view{text1 | gapped_kmer_view, + text1 | rev_gapped_kmer_view, + 20})); +} + +TEST_F(minimiser_test, combinability) +{ + auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); + EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | kmer_view | minimiser_no_rev_view); + EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | gapped_kmer_view | minimiser_no_rev_view); + + EXPECT_RANGE_EQ(result3_stop, (seqan3::detail::minimiser_distance_view{text3 | stop_at_t | kmer_view, + text3 | stop_at_t | rev_kmer_view, + 5})); + EXPECT_RANGE_EQ(result3_gapped_stop, (seqan3::detail::minimiser_distance_view{text3 | stop_at_t | gapped_kmer_view, + text3 | stop_at_t | rev_gapped_kmer_view, + 5})); + + auto start_at_a = std::views::drop(6); + EXPECT_RANGE_EQ(result3_start, (seqan3::detail::minimiser_distance_view{text3 | start_at_a | kmer_view, + text3 | start_at_a | rev_kmer_view, + 5})); + EXPECT_RANGE_EQ(result3_start, (seqan3::detail::minimiser_distance_view{text3 | start_at_a | gapped_kmer_view, + text3 | start_at_a | rev_gapped_kmer_view, + 5})); +} + +/*TEST_F(minimiser_test, non_arithmetic_value) +{ + // just compute the minimizer directly on the alphabet + EXPECT_RANGE_EQ("ACACA"_dna4, text3 | minimiser_no_rev_view); +}*/ + +TEST_F(minimiser_test, two_ranges_unequal_size) +{ + EXPECT_THROW((seqan3::detail::minimiser_distance_view{text1 | kmer_view, text3 | rev_kmer_view, 5}), std::invalid_argument); +} diff --git a/test/api/modmer_hash_distance_test.cpp b/test/api/modmer_hash_distance_test.cpp new file mode 100644 index 0000000..4260f7d --- /dev/null +++ b/test/api/modmer_hash_distance_test.cpp @@ -0,0 +1,130 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "../../lib/seqan3/test/unit/range/iterator_test_template.hpp" + +#include "modmer_hash_distance.hpp" + +using seqan3::operator""_dna4; +using seqan3::operator""_shape; +using result_t = std::vector; + +using iterator_type = std::ranges::iterator_t() + | modmer_hash_distance(seqan3::ungapped{4}, + 2, + seqan3::seed{0}))>; + +static constexpr seqan3::shape ungapped_shape = seqan3::ungapped{4}; +static constexpr seqan3::shape gapped_shape = 0b1001_shape; +static constexpr auto ungapped_view = modmer_hash_distance(ungapped_shape, + 2, + seqan3::seed{0}); +static constexpr auto gapped_view = modmer_hash_distance(gapped_shape, + 2, + seqan3::seed{0}); + +template <> +struct iterator_fixture : public ::testing::Test +{ + using iterator_tag = std::forward_iterator_tag; + static constexpr bool const_iterable = false; + + seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; + result_t expected_range{6, 1, 0, 0}; + + using test_range_t = decltype(text | ungapped_view); + test_range_t test_range = text | ungapped_view; +}; + +using test_type = ::testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(iterator_fixture, iterator_fixture, test_type, ); + +template +class modmer_hash_distance_view_properties_test: public ::testing::Test { }; + +using underlying_range_types = ::testing::Types, + std::vector const, + seqan3::bitpacked_sequence, + seqan3::bitpacked_sequence const, + std::list, + std::list const>; + +TYPED_TEST_SUITE(modmer_hash_distance_view_properties_test, underlying_range_types, ); + +class modmer_hash_distance_test : public ::testing::Test +{ +protected: + std::vector text1{"AAAAAA"_dna4}; + result_t result1{}; // Same result for ungapped and gapped + + std::vector too_short_text{"AC"_dna4}; + + // ACGG CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG + // CCGT GCCG CGCC TCGC GTCG CGTC ACGT AACG AAAC TAAA CTAA + // ACGG CGGC cgcc GCGA CGAC cgtc ACGT aacg aaac taaa ctaa + std::vector text3{"ACGGCGACGTTTAG"_dna4}; + result_t result3{6, 1, 0, 0}; // ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA // A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap + result_t result3_stop{}; + result_t result3_start{0, 1, 0, 0}; +}; + +template +void compare_types(adaptor_t v) +{ + EXPECT_TRUE(std::ranges::input_range); + EXPECT_TRUE(std::ranges::forward_range); + EXPECT_FALSE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::random_access_range); + EXPECT_TRUE(std::ranges::view); + EXPECT_FALSE(std::ranges::sized_range); + EXPECT_FALSE(std::ranges::common_range); + EXPECT_TRUE(seqan3::const_iterable_range); + EXPECT_FALSE((std::ranges::output_range)); +} + +TYPED_TEST(modmer_hash_distance_view_properties_test, different_input_ranges) +{ + TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, + 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG + result_t ungapped{0, 2, 2, 1, 0, 0}; // ACGT/ACGT, TCGA/TCGA, ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA + result_t gapped{0, 2, 2, 1, 0, 0}; // A--T/A--T, T--A/T--A, A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap + EXPECT_RANGE_EQ(ungapped, text | ungapped_view); + EXPECT_RANGE_EQ(gapped, text | gapped_view); +} + +TEST_F(modmer_hash_distance_test, ungapped) +{ + EXPECT_RANGE_EQ(result1, text1 | ungapped_view); + EXPECT_TRUE(std::ranges::empty(too_short_text | ungapped_view)); + EXPECT_RANGE_EQ(result3, text3 | ungapped_view); +} + +TEST_F(modmer_hash_distance_test, gapped) +{ + EXPECT_RANGE_EQ(result1, text1 | gapped_view); + EXPECT_TRUE(std::ranges::empty(too_short_text | gapped_view)); + EXPECT_RANGE_EQ(result3, text3 | gapped_view); +} + +TEST_F(modmer_hash_distance_test, combinability) +{ + auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); + EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | ungapped_view); + EXPECT_RANGE_EQ(result3_stop, text3 | stop_at_t | gapped_view); + + auto start_at_a = std::views::drop(6); + EXPECT_RANGE_EQ(result3_start, text3 | start_at_a | ungapped_view); + EXPECT_RANGE_EQ(result3_start, text3 | start_at_a | gapped_view); +} diff --git a/test/api/modmer_hash_test.cpp b/test/api/modmer_hash_test.cpp new file mode 100644 index 0000000..2294770 --- /dev/null +++ b/test/api/modmer_hash_test.cpp @@ -0,0 +1,138 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "../../lib/seqan3/test/unit/range/iterator_test_template.hpp" + +#include "modmer_hash.hpp" + +using seqan3::operator""_dna4; +using seqan3::operator""_shape; +using result_t = std::vector; + +using iterator_type = std::ranges::iterator_t() + | modmer_hash(seqan3::ungapped{4}, + 2, + seqan3::seed{0}))>; + +static constexpr seqan3::shape ungapped_shape = seqan3::ungapped{4}; +static constexpr seqan3::shape gapped_shape = 0b1001_shape; +static constexpr auto ungapped_view = modmer_hash(ungapped_shape, + 2, + seqan3::seed{0}); +static constexpr auto ungapped_3_view = modmer_hash(ungapped_shape, + 3, + seqan3::seed{0}); +static constexpr auto gapped_view = modmer_hash(gapped_shape, + 2, + seqan3::seed{0}); + +template <> +struct iterator_fixture : public ::testing::Test +{ + using iterator_tag = std::forward_iterator_tag; + static constexpr bool const_iterable = false; + + seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; + result_t expected_range{27+27, 191+1, 252+192, 242+112}; + + using test_range_t = decltype(text | ungapped_view); + test_range_t test_range = text | ungapped_view; +}; + +using test_type = ::testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(iterator_fixture, iterator_fixture, test_type, ); + +template +class modmer_hash_view_properties_test: public ::testing::Test { }; + +using underlying_range_types = ::testing::Types, + std::vector const, + seqan3::bitpacked_sequence, + seqan3::bitpacked_sequence const, + std::list, + std::list const>; + +TYPED_TEST_SUITE(modmer_hash_view_properties_test, underlying_range_types, ); + +class modmer_hash_test : public ::testing::Test +{ +protected: + std::vector text1{"AAAAAA"_dna4}; + result_t result1{}; // Same result for ungapped and gapped + + std::vector too_short_text{"AC"_dna4}; + + // ACGG CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG + // CCGT GCCG CGCC TCGC GTCG CGTC ACGT AACG AAAC TAAA CTAA + // ACGG CGGC cgcc GCGA CGAC cgtc ACGT aacg aaac taaa ctaa + std::vector text3{"ACGGCGACGTTTAG"_dna4}; + result_t result3_ungapped{27+27, 191+1, 252+192, 242+112}; // ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA + result_t result3_gapped{3+3, 11+1, 12+12, 14+4}; // A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap + result_t result3_ungapped_3{117, 255, 267, 369, 279, 243, 27+27, 117, 191+1, 252+192, 242+112}; +}; + +template +void compare_types(adaptor_t v) +{ + EXPECT_TRUE(std::ranges::input_range); + EXPECT_TRUE(std::ranges::forward_range); + EXPECT_FALSE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::random_access_range); + EXPECT_TRUE(std::ranges::view); + EXPECT_FALSE(std::ranges::sized_range); + EXPECT_FALSE(std::ranges::common_range); + EXPECT_TRUE(seqan3::const_iterable_range); + EXPECT_FALSE((std::ranges::output_range)); +} + +TYPED_TEST(modmer_hash_view_properties_test, different_input_ranges) +{ + TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, + 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG + result_t ungapped{27+27,216+216, 27+27, 191+1, 252+192, 242+112}; // ACGT/ACGT, TCGA/TCGA, ACGT/ACGT, GTTT/AAAC, TTTA/TAAA, TTAG/CTAA + result_t gapped{3+3, 12+12, 3+3, 11+1, 12+12, 14+4}; // A--T/A--T, T--A/T--A, A--T/A--T, G--T/A--C, T--A/T--A, T--G/C--A - "-" for gap + EXPECT_RANGE_EQ(ungapped, text | ungapped_view); + EXPECT_RANGE_EQ(gapped, text | gapped_view); +} + +TEST_F(modmer_hash_test, ungapped) +{ + EXPECT_RANGE_EQ(result1, text1 | ungapped_view); + EXPECT_TRUE(std::ranges::empty(too_short_text | ungapped_view)); + EXPECT_RANGE_EQ(result3_ungapped, text3 | ungapped_view); + EXPECT_NO_THROW(text1 | modmer_hash(ungapped_shape, 2)); + EXPECT_RANGE_EQ(result3_ungapped_3, text3 | ungapped_3_view); + EXPECT_THROW((text3 | modmer_hash(ungapped_shape, 1)), std::invalid_argument); +} + +TEST_F(modmer_hash_test, gapped) +{ + EXPECT_RANGE_EQ(result1, text1 | gapped_view); + EXPECT_TRUE(std::ranges::empty(too_short_text | gapped_view)); + EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_view); + EXPECT_NO_THROW(text1 | modmer_hash(gapped_shape, 2)); + EXPECT_THROW((text3 | modmer_hash(gapped_shape, 1)), std::invalid_argument); +} + +TEST_F(modmer_hash_test, combinability) +{ + auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); + EXPECT_RANGE_EQ(result1, text3 | stop_at_t | ungapped_view); + EXPECT_RANGE_EQ(result1, text3 | stop_at_t | gapped_view); + + auto start_at_a = std::views::drop(6); + EXPECT_RANGE_EQ(result3_ungapped, text3 | start_at_a | ungapped_view); + EXPECT_RANGE_EQ(result3_gapped, text3 | start_at_a | gapped_view); +} diff --git a/test/api/modmer_test.cpp b/test/api/modmer_test.cpp new file mode 100644 index 0000000..7dbbb6b --- /dev/null +++ b/test/api/modmer_test.cpp @@ -0,0 +1,164 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "../../lib/seqan3/test/unit/range/iterator_test_template.hpp" + +#include "modmer.hpp" + +using seqan3::operator""_dna4; +using seqan3::operator""_shape; +using result_t = std::vector; + +inline static constexpr auto kmer_view = seqan3::views::kmer_hash(seqan3::ungapped{4}); +inline static constexpr auto gapped_kmer_view = seqan3::views::kmer_hash(0b1001_shape); + +inline static constexpr auto modmer_view = modmer(2); + +using iterator_type = std::ranges::iterator_t< decltype(std::declval() + | kmer_view + | modmer_view)>; + +template <> +struct iterator_fixture : public ::testing::Test +{ + using iterator_tag = std::forward_iterator_tag; + static constexpr bool const_iterable = true; + + seqan3::dna4_vector text{"ACGGCGACGTTTAG"_dna4}; + decltype(seqan3::views::kmer_hash(text, seqan3::ungapped{4})) vec = text | kmer_view; + result_t expected_range{26, 166, 152, 134, 252, 242}; + + decltype(modmer(seqan3::views::kmer_hash(text, seqan3::ungapped{4}), 2)) test_range = + modmer(vec, 2); +}; + +using test_types = ::testing::Types; +INSTANTIATE_TYPED_TEST_SUITE_P(iterator_fixture, iterator_fixture, test_types, ); + +template +class modmer_view_properties_test: public ::testing::Test { }; + +using underlying_range_types = ::testing::Types, + std::vector const, + seqan3::bitpacked_sequence, + seqan3::bitpacked_sequence const, + std::list, + std::list const, + std::forward_list, + std::forward_list const>; +TYPED_TEST_SUITE(modmer_view_properties_test, underlying_range_types, ); + +class modmer_test : public ::testing::Test +{ +protected: + std::vector text1{"AAAAAA"_dna4}; + result_t result1{0, 0, 0}; // Same result for ungapped and gapped + result_t result1_distance{0, 0, 0}; + + std::vector too_short_text{"AC"_dna4}; + + // ACGG CGGC, GGCG, GCGA, CGAC, GACG, ACGT, CGTT, GTTT, TTTA, TTAG + // CCGT GCCG CGCC TCGC GTCG CGTC ACGT AACG AAAC TAAA CTAA + // ACGG CGGC cgcc GCGA CGAC cgtc ACGT aacg aaac taaa ctaa + std::vector text3{"ACGGCGACGTTTAG"_dna4}; + result_t result3_ungapped{26, 166, 152, 134, 252, 242}; // ACGG, GGCG, GCGA, GACG, TTTA, TTAG + result_t result3_gapped{2, 10, 8, 10, 12, 14}; // A--G, G--G, G--A, G--G, T--A, T--G "-" for gap + result_t result3_distance{0, 1, 0, 1, 3, 0}; + result_t result3_ungapped_stop{26, 166, 152, 134}; // ACGG, GGCG, GCGA, GACG + result_t result3_gapped_stop{2, 10, 8, 10}; + result_t result3_distance_stop{0, 1, 0, 1}; + result_t result3_ungapped_start{252, 242}; // For start at second A, TTTA, TTAG + result_t result3_gapped_start{14}; // For start at second A, T--G + result_t result3_distance_start{3, 0}; +}; + +template +void compare_types(adaptor_t v) +{ + EXPECT_TRUE(std::ranges::input_range); + EXPECT_TRUE(std::ranges::forward_range); + EXPECT_FALSE(std::ranges::bidirectional_range); + EXPECT_FALSE(std::ranges::random_access_range); + EXPECT_TRUE(std::ranges::view); + EXPECT_FALSE(std::ranges::sized_range); + EXPECT_FALSE(std::ranges::common_range); + EXPECT_TRUE(seqan3::const_iterable_range); + EXPECT_FALSE((std::ranges::output_range)); +} + +TYPED_TEST(modmer_view_properties_test, concepts) +{ + TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, + 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG + + auto v = text | kmer_view | modmer_view; + compare_types(v); +} + +TYPED_TEST(modmer_view_properties_test, different_inputs_kmer_hash) +{ + TypeParam text{'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, 'C'_dna4, 'G'_dna4, 'A'_dna4, 'C'_dna4, 'G'_dna4, 'T'_dna4, + 'T'_dna4, 'T'_dna4, 'A'_dna4, 'G'_dna4}; // ACGTCGACGTTTAG + result_t ungapped{182, 216, 134, 252, 242}; // GTCG, TCGA, GACG, TTTA, TTAG + result_t gapped{10, 12, 10, 12, 14}; // G--G, T--G, T--A, G--G, T--A, T--G "-" for gap + EXPECT_RANGE_EQ(ungapped, text | kmer_view | modmer_view); + EXPECT_RANGE_EQ(gapped, text | gapped_kmer_view | modmer_view); +} + +TEST_F(modmer_test, ungapped_kmer_hash) +{ + EXPECT_RANGE_EQ(result1, text1 | kmer_view | modmer_view); + auto empty_view = too_short_text | kmer_view | modmer_view; + EXPECT_TRUE(std::ranges::empty(empty_view)); + EXPECT_RANGE_EQ(result3_ungapped, text3 | kmer_view | modmer_view); + + auto v1 = text1 | kmer_view; + EXPECT_RANGE_EQ(result1_distance, (seqan3::detail::modmer_view(v1, 2))); + auto v2 = text3 | kmer_view; + EXPECT_RANGE_EQ(result3_distance, (seqan3::detail::modmer_view(v2, 2))); +} + +TEST_F(modmer_test, gapped_kmer_hash) +{ + EXPECT_RANGE_EQ(result1, text1 | gapped_kmer_view | modmer_view); + auto empty_view = too_short_text | gapped_kmer_view | modmer_view; + EXPECT_TRUE(std::ranges::empty(empty_view)); + EXPECT_RANGE_EQ(result3_gapped, text3 | gapped_kmer_view | modmer_view); + + auto v1 = text1 | gapped_kmer_view; + EXPECT_RANGE_EQ(result1_distance, (seqan3::detail::modmer_view(v1, 2))); + auto v2 = text3 | gapped_kmer_view; + EXPECT_RANGE_EQ(result3_distance, (seqan3::detail::modmer_view(v2, 2))); +} + +TEST_F(modmer_test, combinability) +{ + auto stop_at_t = std::views::take_while([] (seqan3::dna4 const x) { return x != 'T'_dna4; }); + EXPECT_RANGE_EQ(result3_ungapped_stop, text3 | stop_at_t | kmer_view | modmer_view); + EXPECT_RANGE_EQ(result3_gapped_stop, text3 | stop_at_t | gapped_kmer_view | modmer_view); + + auto v1 = text3 | stop_at_t | kmer_view; + auto v2 = text3 | stop_at_t | kmer_view; + EXPECT_RANGE_EQ(result3_distance_stop, (seqan3::detail::modmer_view(v1, 2))); + EXPECT_RANGE_EQ(result3_distance_stop, (seqan3::detail::modmer_view(v2, 2))); + + auto start_at_a = std::views::drop(6); + EXPECT_RANGE_EQ(result3_ungapped_start, (seqan3::detail::modmer_view{text3 | start_at_a | kmer_view, 2})); + + auto v3 = text3 | start_at_a | kmer_view; + auto v4 = text3 | start_at_a | gapped_kmer_view; + EXPECT_RANGE_EQ(result3_distance_start, (seqan3::detail::modmer_view(v3, 2))); + EXPECT_RANGE_EQ(result3_distance_start, (seqan3::detail::modmer_view(v4, 2))); +} diff --git a/test/cli/minions_coverage_test.cpp b/test/cli/minions_coverage_test.cpp index 892cf99..557b88c 100644 --- a/test/cli/minions_coverage_test.cpp +++ b/test/cli/minions_coverage_test.cpp @@ -14,25 +14,25 @@ TEST_F(cli_test, no_options) EXPECT_EQ(result.err, std::string{}); } -TEST_F(cli_test, kmer) +TEST_F(cli_test, minimiser) { - cli_test_result result = execute_app("minions coverage --method kmer -k 19", data("example1.fasta")); + cli_test_result result = execute_app("minions coverage --method minimiser -k 19 -w 19 ", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, std::string{}); } -TEST_F(cli_test, gapped_kmer) +TEST_F(cli_test, gapped_minimiser) { - cli_test_result result = execute_app("minions coverage --method kmer --shape 524223", data("example1.fasta")); + cli_test_result result = execute_app("minions coverage --method minimiser -k 19 -w 19 --shape 524223", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, std::string{}); } -TEST_F(cli_test, minimiser) +TEST_F(cli_test, modmer) { - cli_test_result result = execute_app("minions coverage --method minimiser -k 19 -w 19 ", data("example1.fasta")); + cli_test_result result = execute_app("minions coverage --method modmer -k 19 -w 2 ", data("example1.fasta")); EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{}); EXPECT_EQ(result.err, std::string{}); @@ -44,7 +44,7 @@ TEST_F(cli_test, wrong_method) std::string expected { "Error. Incorrect command line input for coverage. Validation failed " - "for option --method: Value submer is not one of [kmer,minimiser].\n" + "for option --method: Value submer is not one of [kmer,minimiser,modmer].\n" }; EXPECT_EQ(result.exit_code, 0); diff --git a/test/cli/minions_speed_test.cpp b/test/cli/minions_speed_test.cpp index 60dccaf..3f70ab7 100644 --- a/test/cli/minions_speed_test.cpp +++ b/test/cli/minions_speed_test.cpp @@ -30,6 +30,14 @@ TEST_F(cli_test, minimiser) EXPECT_EQ(result.err, std::string{}); } +TEST_F(cli_test, modmer) +{ + cli_test_result result = execute_app("minions speed --method modmer -k 19 -w 2 ", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + TEST_F(cli_test, strobemer) { cli_test_result result = execute_app("minions speed --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --randstrobemers", data("example1.fasta")); @@ -38,13 +46,29 @@ TEST_F(cli_test, strobemer) EXPECT_EQ(result.err, std::string{}); } +TEST_F(cli_test, hybridstrobemer) +{ + cli_test_result result = execute_app("minions speed --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --hybrid", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + +TEST_F(cli_test, minstrobers) +{ + cli_test_result result = execute_app("minions speed --method strobemer -k 19 --w-min 16 --w-max 30 --order 2 --minstrobers", data("example1.fasta")); + EXPECT_EQ(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{}); +} + TEST_F(cli_test, wrong_method) { cli_test_result result = execute_app("minions speed --method submer -k 19", data("example1.fasta")); std::string expected { "Error. Incorrect command line input for speed. Validation failed " - "for option --method: Value submer is not one of [kmer,minimiser,strobemer].\n" + "for option --method: Value submer is not one of [kmer,minimiser,modmer,strobemer].\n" }; EXPECT_EQ(result.exit_code, 0); EXPECT_EQ(result.out, std::string{});