From 3e53eee88aa96910bcccc6cbab09e6455a4ebc38 Mon Sep 17 00:00:00 2001 From: Martin Steinegger Date: Fri, 18 Oct 2024 23:01:59 +0200 Subject: [PATCH] Update tantan --- CMakeLists.txt | 4 + lib/tantan/CMakeLists.txt | 2 + lib/tantan/mcf_simd.h | 538 ++++++++++++++++++++++++++ lib/tantan/tantan.cpp | 549 +++++++++++++++++++++++++++ {src/commons => lib/tantan}/tantan.h | 56 ++- src/CMakeLists.txt | 2 +- src/alignment/PSSMMasker.h | 6 +- src/commons/BaseMatrix.h | 2 +- src/commons/CMakeLists.txt | 2 - src/commons/tantan.cpp | 506 ------------------------ src/linclust/kmermatcher.cpp | 4 +- src/prefiltering/IndexBuilder.cpp | 5 +- src/util/masksequence.cpp | 4 +- 13 files changed, 1140 insertions(+), 540 deletions(-) create mode 100644 lib/tantan/CMakeLists.txt create mode 100644 lib/tantan/mcf_simd.h create mode 100644 lib/tantan/tantan.cpp rename {src/commons => lib/tantan}/tantan.h (68%) delete mode 100644 src/commons/tantan.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index adde42630..c0d920427 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -205,6 +205,10 @@ add_subdirectory(lib/tinyexpr EXCLUDE_FROM_ALL) include_directories(lib/microtar) add_subdirectory(lib/microtar) +# tantan +include_directories(lib/tantan) +add_subdirectory(lib/tantan) + # simde include_directories(lib/simde) diff --git a/lib/tantan/CMakeLists.txt b/lib/tantan/CMakeLists.txt new file mode 100644 index 000000000..b4746a56c --- /dev/null +++ b/lib/tantan/CMakeLists.txt @@ -0,0 +1,2 @@ +add_library(tantan tantan.cpp tantan.h mcf_simd.h) +set_target_properties(tantan PROPERTIES COMPILE_FLAGS "${MMSEQS_C_FLAGS}" LINK_FLAGS "${MMSEQS_C_FLAGS}") diff --git a/lib/tantan/mcf_simd.h b/lib/tantan/mcf_simd.h new file mode 100644 index 000000000..c5dd4687a --- /dev/null +++ b/lib/tantan/mcf_simd.h @@ -0,0 +1,538 @@ +// Author: Martin C. Frith 2019 +// SPDX-License-Identifier: MPL-2.0 + +#ifndef MCF_SIMD_HH +#define MCF_SIMD_HH + +#if defined __SSE4_1__ +#include +#elif defined __ARM_NEON +#include +#endif + +#include // size_t + +namespace mcf { + +#if defined __AVX2__ + +typedef __m256i SimdInt; +typedef __m256i SimdUint1; +typedef __m256d SimdDbl; + +const int simdBytes = 32; + +static inline SimdInt simdZero() { + return _mm256_setzero_si256(); +} + +static inline SimdInt simdZero1() { + return _mm256_setzero_si256(); +} + +static inline SimdDbl simdZeroDbl() { + return _mm256_setzero_pd(); +} + +static inline SimdInt simdOnes1() { + return _mm256_set1_epi32(-1); +} + +static inline SimdInt simdLoad(const void *p) { + return _mm256_loadu_si256((const SimdInt *)p); +} + +static inline SimdInt simdLoad1(const void *p) { + return _mm256_loadu_si256((const SimdInt *)p); +} + +static inline SimdDbl simdLoadDbl(const double *p) { + return _mm256_loadu_pd(p); +} + +static inline void simdStore(void *p, SimdInt x) { + _mm256_storeu_si256((SimdInt *)p, x); +} + +static inline void simdStore1(void *p, SimdInt x) { + _mm256_storeu_si256((SimdInt *)p, x); +} + +static inline void simdStoreDbl(double *p, SimdDbl x) { + _mm256_storeu_pd(p, x); +} + +static inline SimdInt simdOr1(SimdInt x, SimdInt y) { + return _mm256_or_si256(x, y); +} + +static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) { + return _mm256_blendv_epi8(x, y, mask); +} + +const int simdLen = 8; +const int simdDblLen = 4; + +static inline SimdInt simdSet(int i7, int i6, int i5, int i4, + int i3, int i2, int i1, int i0) { + return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0); +} + +static inline SimdInt simdSet1(char jF, char jE, char jD, char jC, + char jB, char jA, char j9, char j8, + char j7, char j6, char j5, char j4, + char j3, char j2, char j1, char j0, + char iF, char iE, char iD, char iC, + char iB, char iA, char i9, char i8, + char i7, char i6, char i5, char i4, + char i3, char i2, char i1, char i0) { + return _mm256_set_epi8(jF, jE, jD, jC, jB, jA, j9, j8, + j7, j6, j5, j4, j3, j2, j1, j0, + iF, iE, iD, iC, iB, iA, i9, i8, + i7, i6, i5, i4, i3, i2, i1, i0); +} + +static inline SimdDbl simdSetDbl(double i3, double i2, double i1, double i0) { + return _mm256_set_pd(i3, i2, i1, i0); +} + +static inline SimdInt simdFill(int x) { + return _mm256_set1_epi32(x); +} + +static inline SimdInt simdFill1(char x) { + return _mm256_set1_epi8(x); +} + +static inline SimdDbl simdFillDbl(double x) { + return _mm256_set1_pd(x); +} + +static inline SimdInt simdGt(SimdInt x, SimdInt y) { + return _mm256_cmpgt_epi32(x, y); +} + +static inline SimdInt simdGe1(SimdInt x, SimdInt y) { + return _mm256_cmpeq_epi8(_mm256_min_epu8(x, y), y); +} + +static inline SimdInt simdAdd(SimdInt x, SimdInt y) { + return _mm256_add_epi32(x, y); +} + +static inline SimdInt simdAdd1(SimdInt x, SimdInt y) { + return _mm256_add_epi8(x, y); +} + +static inline SimdInt simdAdds1(SimdInt x, SimdInt y) { + return _mm256_adds_epu8(x, y); +} + +static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) { + return _mm256_add_pd(x, y); +} + +static inline SimdInt simdSub(SimdInt x, SimdInt y) { + return _mm256_sub_epi32(x, y); +} + +static inline SimdInt simdSub1(SimdInt x, SimdInt y) { + return _mm256_sub_epi8(x, y); +} + +static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) { + return _mm256_mul_pd(x, y); +} + +static inline SimdInt simdQuadruple1(SimdInt x) { + return _mm256_slli_epi32(x, 2); +} + +static inline SimdInt simdMax(SimdInt x, SimdInt y) { + return _mm256_max_epi32(x, y); +} + +static inline SimdInt simdMin1(SimdInt x, SimdInt y) { + return _mm256_min_epu8(x, y); +} + +static inline int simdHorizontalMax(SimdInt x) { + __m128i z = _mm256_castsi256_si128(x); + z = _mm_max_epi32(z, _mm256_extracti128_si256(x, 1)); + z = _mm_max_epi32(z, _mm_shuffle_epi32(z, 0x4E)); + z = _mm_max_epi32(z, _mm_shuffle_epi32(z, 0xB1)); + return _mm_cvtsi128_si32(z); +} + +static inline int simdHorizontalMin1(SimdInt x) { + __m128i z = _mm256_castsi256_si128(x); + z = _mm_min_epu8(z, _mm256_extracti128_si256(x, 1)); + z = _mm_min_epu8(z, _mm_srli_epi16(z, 8)); + z = _mm_minpos_epu16(z); + return _mm_extract_epi16(z, 0); +} + +static inline double simdHorizontalAddDbl(SimdDbl x) { + __m128d z = _mm256_castpd256_pd128(x); + z = _mm_add_pd(z, _mm256_extractf128_pd(x, 1)); + return _mm_cvtsd_f64(_mm_hadd_pd(z, z)); +} + +static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) { + return _mm256_shuffle_epi8(items, choices); +} + +#elif defined __SSE4_1__ + +typedef __m128i SimdInt; +typedef __m128i SimdUint1; +typedef __m128d SimdDbl; + +const int simdBytes = 16; + +static inline SimdInt simdZero() { + return _mm_setzero_si128(); +} + +static inline SimdInt simdZero1() { + return _mm_setzero_si128(); +} + +static inline SimdDbl simdZeroDbl() { + return _mm_setzero_pd(); +} + +static inline SimdInt simdOnes1() { + return _mm_set1_epi32(-1); +} + +static inline SimdInt simdLoad(const void *p) { + return _mm_loadu_si128((const SimdInt *)p); +} + +static inline SimdInt simdLoad1(const void *p) { + return _mm_loadu_si128((const SimdInt *)p); +} + +static inline SimdDbl simdLoadDbl(const double *p) { + return _mm_loadu_pd(p); +} + +static inline void simdStore(void *p, SimdInt x) { + _mm_storeu_si128((SimdInt *)p, x); +} + +static inline void simdStore1(void *p, SimdInt x) { + _mm_storeu_si128((SimdInt *)p, x); +} + +static inline void simdStoreDbl(double *p, SimdDbl x) { + _mm_storeu_pd(p, x); +} + +static inline SimdInt simdOr1(SimdInt x, SimdInt y) { + return _mm_or_si128(x, y); +} + +static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) { + return _mm_blendv_epi8(x, y, mask); // SSE4.1 +} + +const int simdLen = 4; +const int simdDblLen = 2; + +static inline SimdInt simdSet(int i3, int i2, int i1, int i0) { + return _mm_set_epi32(i3, i2, i1, i0); +} + +static inline SimdInt simdSet1(char iF, char iE, char iD, char iC, + char iB, char iA, char i9, char i8, + char i7, char i6, char i5, char i4, + char i3, char i2, char i1, char i0) { + return _mm_set_epi8(iF, iE, iD, iC, iB, iA, i9, i8, + i7, i6, i5, i4, i3, i2, i1, i0); +} + +static inline SimdDbl simdSetDbl(double i1, double i0) { + return _mm_set_pd(i1, i0); +} + +static inline SimdInt simdFill(int x) { + return _mm_set1_epi32(x); +} + +static inline SimdInt simdFill1(char x) { + return _mm_set1_epi8(x); +} + +static inline SimdDbl simdFillDbl(double x) { + return _mm_set1_pd(x); +} + +static inline SimdInt simdGt(SimdInt x, SimdInt y) { + return _mm_cmpgt_epi32(x, y); +} + +static inline SimdInt simdGe1(SimdInt x, SimdInt y) { + return _mm_cmpeq_epi8(_mm_min_epu8(x, y), y); +} + +static inline SimdInt simdAdd(SimdInt x, SimdInt y) { + return _mm_add_epi32(x, y); +} + +static inline SimdInt simdAdd1(SimdInt x, SimdInt y) { + return _mm_add_epi8(x, y); +} + +static inline SimdInt simdAdds1(SimdInt x, SimdInt y) { + return _mm_adds_epu8(x, y); +} + +static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) { + return _mm_add_pd(x, y); +} + +static inline SimdInt simdSub(SimdInt x, SimdInt y) { + return _mm_sub_epi32(x, y); +} + +static inline SimdInt simdSub1(SimdInt x, SimdInt y) { + return _mm_sub_epi8(x, y); +} + +static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) { + return _mm_mul_pd(x, y); +} + +static inline SimdInt simdQuadruple1(SimdInt x) { + return _mm_slli_epi32(x, 2); +} + +static inline SimdInt simdMax(SimdInt x, SimdInt y) { + return _mm_max_epi32(x, y); // SSE4.1 +} + +static inline SimdInt simdMin1(SimdInt x, SimdInt y) { + return _mm_min_epu8(x, y); +} + +static inline int simdHorizontalMax(SimdInt x) { + x = simdMax(x, _mm_shuffle_epi32(x, 0x4E)); + x = simdMax(x, _mm_shuffle_epi32(x, 0xB1)); + return _mm_cvtsi128_si32(x); +} + +static inline int simdHorizontalMin1(SimdInt x) { + x = _mm_min_epu8(x, _mm_srli_epi16(x, 8)); + x = _mm_minpos_epu16(x); // SSE4.1 + return _mm_extract_epi16(x, 0); +} + +static inline double simdHorizontalAddDbl(SimdDbl x) { + return _mm_cvtsd_f64(_mm_hadd_pd(x, x)); +} + +static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) { + return _mm_shuffle_epi8(items, choices); // SSSE3 +} + +#elif defined __ARM_NEON + +typedef int32x4_t SimdInt; +typedef uint32x4_t SimdUint; +typedef uint8x16_t SimdUint1; +typedef float64x2_t SimdDbl; + +const int simdBytes = 16; + +static inline SimdInt simdZero() { + return vdupq_n_s32(0); +} + +static inline SimdUint1 simdZero1() { + return vdupq_n_u8(0); +} + +static inline SimdDbl simdZeroDbl() { + return vdupq_n_f64(0); +} + +static inline SimdUint1 simdOnes1() { + return vdupq_n_u8(-1); +} + +static inline SimdInt simdLoad(const int *p) { + return vld1q_s32(p); +} + +static inline SimdUint1 simdLoad1(const unsigned char *p) { + return vld1q_u8(p); +} + +static inline SimdDbl simdLoadDbl(const double *p) { + return vld1q_f64(p); +} + +static inline void simdStore(int *p, SimdInt x) { + vst1q_s32(p, x); +} + +static inline void simdStore1(unsigned char *p, SimdUint1 x) { + vst1q_u8(p, x); +} + +static inline void simdStoreDbl(double *p, SimdDbl x) { + vst1q_f64(p, x); +} + +static inline SimdUint1 simdOr1(SimdUint1 x, SimdUint1 y) { + return vorrq_u8(x, y); +} + +static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdUint mask) { + return vbslq_s32(mask, y, x); +} + +const int simdLen = 4; +const int simdDblLen = 2; + +static inline SimdInt simdSet(unsigned i3, unsigned i2, + unsigned i1, unsigned i0) { + size_t lo = i1; + size_t hi = i3; + return + vcombine_s32(vcreate_s32((lo << 32) | i0), vcreate_s32((hi << 32) | i2)); +} + +static inline SimdUint1 simdSet1(unsigned char iF, unsigned char iE, + unsigned char iD, unsigned char iC, + unsigned char iB, unsigned char iA, + unsigned char i9, unsigned char i8, + unsigned char i7, unsigned char i6, + unsigned char i5, unsigned char i4, + unsigned char i3, unsigned char i2, + unsigned char i1, unsigned char i0) { + size_t lo = + (size_t)i0 | (size_t)i1 << 8 | (size_t)i2 << 16 | (size_t)i3 << 24 | + (size_t)i4 << 32 | (size_t)i5 << 40 | (size_t)i6 << 48 | (size_t)i7 << 56; + + size_t hi = + (size_t)i8 | (size_t)i9 << 8 | (size_t)iA << 16 | (size_t)iB << 24 | + (size_t)iC << 32 | (size_t)iD << 40 | (size_t)iE << 48 | (size_t)iF << 56; + + return vcombine_u8(vcreate_u8(lo), vcreate_u8(hi)); +} + +static inline SimdDbl simdSetDbl(double i1, double i0) { + return vcombine_f64(vdup_n_f64(i0), vdup_n_f64(i1)); +} + +static inline SimdInt simdFill(int x) { + return vdupq_n_s32(x); +} + +static inline SimdUint1 simdFill1(unsigned char x) { + return vdupq_n_u8(x); +} + +static inline SimdDbl simdFillDbl(double x) { + return vdupq_n_f64(x); +} + +static inline SimdUint simdGt(SimdInt x, SimdInt y) { + return vcgtq_s32(x, y); +} + +static inline SimdUint1 simdGe1(SimdUint1 x, SimdUint1 y) { + return vcgeq_u8(x, y); +} + +static inline SimdInt simdAdd(SimdInt x, SimdInt y) { + return vaddq_s32(x, y); +} + +static inline SimdUint1 simdAdd1(SimdUint1 x, SimdUint1 y) { + return vaddq_u8(x, y); +} + +static inline SimdUint1 simdAdds1(SimdUint1 x, SimdUint1 y) { + return vqaddq_u8(x, y); +} + +static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) { + return vaddq_f64(x, y); +} + +static inline SimdInt simdSub(SimdInt x, SimdInt y) { + return vsubq_s32(x, y); +} + +static inline SimdUint1 simdSub1(SimdUint1 x, SimdUint1 y) { + return vsubq_u8(x, y); +} + +static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) { + return vmulq_f64(x, y); +} + +static inline SimdUint1 simdQuadruple1(SimdUint1 x) { + return vshlq_n_u8(x, 2); +} + +static inline SimdInt simdMax(SimdInt x, SimdInt y) { + return vmaxq_s32(x, y); +} + +static inline SimdUint1 simdMin1(SimdUint1 x, SimdUint1 y) { + return vminq_u8(x, y); +} + +static inline int simdHorizontalMax(SimdInt x) { + return vmaxvq_s32(x); +} + +static inline int simdHorizontalMin1(SimdUint1 x) { + return vminvq_u8(x); +} + +static inline double simdHorizontalAddDbl(SimdDbl x) { + return vaddvq_f64(x); +} + +static inline SimdUint1 simdChoose1(SimdUint1 items, SimdUint1 choices) { + return vqtbl1q_u8(items, choices); +} + +#else + +typedef int SimdInt; +typedef double SimdDbl; +const int simdBytes = 1; +const int simdLen = 1; +const int simdDblLen = 1; +static inline int simdZero() { return 0; } +static inline double simdZeroDbl() { return 0; } +static inline int simdSet(int x) { return x; } +static inline double simdSetDbl(double x) { return x; } +static inline int simdFill(int x) { return x; } +static inline int simdLoad(const int *p) { return *p; } +static inline double simdLoadDbl(const double *p) { return *p; } +static inline void simdStore(int *p, int x) { *p = x; } +static inline void simdStoreDbl(double *p, double x) { *p = x; } +static inline double simdFillDbl(double x) { return x; } +static inline int simdGt(int x, int y) { return x > y; } +static inline int simdAdd(int x, int y) { return x + y; } +static inline double simdAddDbl(double x, double y) { return x + y; } +static inline int simdSub(int x, int y) { return x - y; } +static inline double simdMulDbl(double x, double y) { return x * y; } +static inline int simdMax(int x, int y) { return x > y ? x : y; } +static inline int simdBlend(int x, int y, int mask) { return mask ? y : x; } +static inline int simdHorizontalMax(int a) { return a; } +static inline double simdHorizontalAddDbl(double x) { return x; } + +#endif + +} + +#endif diff --git a/lib/tantan/tantan.cpp b/lib/tantan/tantan.cpp new file mode 100644 index 000000000..571539d91 --- /dev/null +++ b/lib/tantan/tantan.cpp @@ -0,0 +1,549 @@ +// Author: Martin C. Frith 2010 +// SPDX-License-Identifier: MPL-2.0 + +#include "tantan.h" +#include "mcf_simd.h" + +#include // fill, max +#include +#include // pow, abs +#include // cerr +#include // accumulate +#include + +#define BEG(v) ((v).empty() ? 0 : &(v).front()) +#define END(v) ((v).empty() ? 0 : &(v).back() + 1) + +namespace tantan { + +using namespace mcf; + +void multiplyAll(std::vector &v, double factor) { + for (std::vector::iterator i = v.begin(); i < v.end(); ++i) + *i *= factor; +} + +double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) { + if (probMult < 1 || probMult > 1) { + return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset)); + } + return 1.0 / maxRepeatOffset; +} + +void checkForwardAndBackwardTotals(double fTot, double bTot) { + double x = std::abs(fTot); + double y = std::abs(bTot); + + // ??? Is 1e6 suitable here ??? + if (std::abs(fTot - bTot) > std::max(x, y) / 1e6) + std::cerr << "tantan: warning: possible numeric inaccuracy\n" + << "tantan: forward algorithm total: " << fTot << "\n" + << "tantan: backward algorithm total: " << bTot << "\n"; +} + +struct Tantan { + enum { scaleStepSize = 16 }; + + const uchar *seqBeg; // start of the sequence + const uchar *seqEnd; // end of the sequence + const uchar *seqPtr; // current position in the sequence + + int maxRepeatOffset; + + const const_double_ptr *likelihoodRatioMatrix; + + double b2b; // transition probability from background to background + double f2b; // transition probability from foreground to background + double g2g; // transition probability from gap/indel to gap/indel + //double f2g; // transition probability from foreground to gap/indel + //double g2f; // transition probability from gap/indel to foreground + double oneGapProb; // f2g * g2f + double endGapProb; // f2g * 1 + double f2f0; // foreground to foreground, if there are 0 indel transitions + double f2f1; // foreground to foreground, if there is 1 indel transition + double f2f2; // foreground to foreground, if there are 2 indel transitions + double b2fDecay; + double b2fGrowth; + double b2fFirst; // background state to first foreground state + double b2fLast; // background state to last foreground state + + double backgroundProb; + std::vector b2fProbs; // background state to each foreground state + std::vector foregroundProbs; + std::vector insertionProbs; + + std::vector scaleFactors; + + Tantan(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb) { + assert(maxRepeatOffset > 0); + assert(repeatProb >= 0 && repeatProb < 1); + // (if repeatProb==1, then any sequence is impossible) + assert(repeatEndProb >= 0 && repeatEndProb <= 1); + assert(repeatOffsetProbDecay > 0 && repeatOffsetProbDecay <= 1); + assert(otherGapProb >= 0 && otherGapProb <= 1); + assert(firstGapProb >= 0); + assert(repeatEndProb + firstGapProb * 2 <= 1); + + this->seqBeg = seqBeg; + this->seqEnd = seqEnd; + this->seqPtr = seqBeg; + this->maxRepeatOffset = maxRepeatOffset; + this->likelihoodRatioMatrix = likelihoodRatioMatrix; + + b2b = 1 - repeatProb; + f2b = repeatEndProb; + g2g = otherGapProb; + //f2g = firstGapProb; + //g2f = 1 - otherGapProb; + oneGapProb = firstGapProb * (1 - otherGapProb); + endGapProb = firstGapProb * (maxRepeatOffset > 1); + f2f0 = 1 - repeatEndProb; + f2f1 = 1 - repeatEndProb - firstGapProb; + f2f2 = 1 - repeatEndProb - firstGapProb * 2; + + b2fDecay = repeatOffsetProbDecay; + b2fGrowth = 1 / repeatOffsetProbDecay; + + b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset); + b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset); + + b2fProbs.resize(maxRepeatOffset); + foregroundProbs.resize(maxRepeatOffset); + insertionProbs.resize(maxRepeatOffset - 1); + + double p = b2fFirst; + for (int i = 0; i < maxRepeatOffset; ++i) { + b2fProbs[i] = p; + p *= b2fDecay; + } + + scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize); + } + + void initializeForwardAlgorithm() { + backgroundProb = 1.0; + std::fill(foregroundProbs.begin(), foregroundProbs.end(), 0.0); + std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); + } + + double forwardTotal() { + double fromForeground = std::accumulate(foregroundProbs.begin(), + foregroundProbs.end(), 0.0); + double total = backgroundProb * b2b + fromForeground * f2b; + assert(total > 0); + return total; + } + + void initializeBackwardAlgorithm() { + backgroundProb = b2b; + std::fill(foregroundProbs.begin(), foregroundProbs.end(), f2b); + std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); + } + + double backwardTotal() { + assert(backgroundProb > 0); + return backgroundProb; + } + + void calcForwardTransitionProbsWithGaps() { + double fromBackground = backgroundProb * b2fLast; + double *foregroundPtr = &foregroundProbs.back(); + double f = *foregroundPtr; + double fromForeground = f; + + double *insertionPtr = &insertionProbs.back(); + double i = *insertionPtr; + *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb; + double d = f; + --foregroundPtr; + fromBackground *= b2fGrowth; + + while (foregroundPtr > &foregroundProbs.front()) { + f = *foregroundPtr; + fromForeground += f; + i = *(insertionPtr - 1); + *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb; + *insertionPtr = f + i * g2g; + d = f + d * g2g; + --foregroundPtr; + --insertionPtr; + fromBackground *= b2fGrowth; + } + + f = *foregroundPtr; + fromForeground += f; + *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb; + *insertionPtr = f; + + backgroundProb = backgroundProb * b2b + fromForeground * f2b; + } + + void calcBackwardTransitionProbsWithGaps() { + double toBackground = f2b * backgroundProb; + double *foregroundPtr = &foregroundProbs.front(); + double f = *foregroundPtr; + double toForeground = f; + + double *insertionPtr = &insertionProbs.front(); + double i = *insertionPtr; + *foregroundPtr = toBackground + f2f1 * f + i; + double d = endGapProb * f; + ++foregroundPtr; + toForeground *= b2fGrowth; + + while (foregroundPtr < &foregroundProbs.back()) { + f = *foregroundPtr; + toForeground += f; + i = *(insertionPtr + 1); + *foregroundPtr = toBackground + f2f2 * f + (i + d); + double oneGapProb_f = oneGapProb * f; + *insertionPtr = oneGapProb_f + g2g * i; + d = oneGapProb_f + g2g * d; + ++foregroundPtr; + ++insertionPtr; + toForeground *= b2fGrowth; + } + + f = *foregroundPtr; + toForeground += f; + *foregroundPtr = toBackground + f2f1 * f + d; + *insertionPtr = endGapProb * f; + + backgroundProb = b2b * backgroundProb + b2fLast * toForeground; + } + + void calcForwardTransitionProbs() { + if (endGapProb > 0) return calcForwardTransitionProbsWithGaps(); + + double b = backgroundProb; + double fromForeground = 0; + double *foregroundBeg = BEG(foregroundProbs); + + for (int i = 0; i < maxRepeatOffset; ++i) { + double f = foregroundBeg[i]; + fromForeground += f; + foregroundBeg[i] = b * b2fProbs[i] + f * f2f0; + } + + backgroundProb = b * b2b + fromForeground * f2b; + } + + void calcBackwardTransitionProbs() { + if (endGapProb > 0) return calcBackwardTransitionProbsWithGaps(); + + double toBackground = f2b * backgroundProb; + double toForeground = 0; + double *foregroundBeg = BEG(foregroundProbs); + + for (int i = 0; i < maxRepeatOffset; ++i) { + double f = foregroundBeg[i]; + toForeground += b2fProbs[i] * f; + foregroundBeg[i] = toBackground + f2f0 * f; + } + + backgroundProb = b2b * backgroundProb + toForeground; + } + + void addEndCounts(double forwardProb, + double totalProb, + double *transitionCounts) { + double toEnd = forwardProb * b2b / totalProb; + transitionCounts[0] += toEnd; + } + + void addTransitionCounts(double forwardProb, + double totalProb, + double *transitionCounts) { + double toBg = forwardProb * b2b / totalProb; + double toFg = forwardProb * b2fFirst / totalProb; + + transitionCounts[0] += backgroundProb * toBg; + + for (double *i = BEG(foregroundProbs); i < END(foregroundProbs); ++i) { + ++transitionCounts; + *transitionCounts += *i * toFg; + toFg *= b2fDecay; + } + } + + bool isNearSeqBeg() { + return seqPtr - seqBeg < maxRepeatOffset; + } + + int maxOffsetInTheSequence() { + return isNearSeqBeg() ? (seqPtr - seqBeg) : maxRepeatOffset; + } + + const uchar *seqFurthestBack() { + return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset; + } + + void calcEmissionProbs() { + const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + const uchar *seqStop = seqFurthestBack(); + double *foregroundPtr = BEG(foregroundProbs); + const uchar *offsetPtr = seqPtr; + + while (offsetPtr > seqStop) { + --offsetPtr; + *foregroundPtr *= lrRow[*offsetPtr]; + ++foregroundPtr; + } + + while (foregroundPtr < END(foregroundProbs)) { + *foregroundPtr *= 0; + ++foregroundPtr; + } + } + + void calcForwardTransitionAndEmissionProbs() { + if (endGapProb > 0) { + calcForwardTransitionProbsWithGaps(); + calcEmissionProbs(); + return; + } + + double b = backgroundProb; + const double *b2f = BEG(b2fProbs); + double *fp = BEG(foregroundProbs); + const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + int maxOffset = maxOffsetInTheSequence(); + const uchar *sp = seqPtr; + + SimdDbl bV = simdFillDbl(b); + SimdDbl tV = simdFillDbl(f2f0); + SimdDbl sV = simdZeroDbl(); + + int i = 0; + for (; i <= maxOffset - simdDblLen; i += simdDblLen) { + SimdDbl rV = simdSetDbl( +#if defined __SSE4_1__ || defined __ARM_NEON +#ifdef __AVX2__ + lrRow[sp[-i-4]], + lrRow[sp[-i-3]], +#endif + lrRow[sp[-i-2]], +#endif + lrRow[sp[-i-1]]); + SimdDbl fV = simdLoadDbl(fp+i); + sV = simdAddDbl(sV, fV); + SimdDbl xV = simdMulDbl(bV, simdLoadDbl(b2f+i)); + simdStoreDbl(fp+i, simdMulDbl(simdAddDbl(xV, simdMulDbl(fV, tV)), rV)); + } + double fromForeground = simdHorizontalAddDbl(sV); + for (; i < maxOffset; ++i) { + double f = fp[i]; + fromForeground += f; + fp[i] = (b * b2f[i] + f * f2f0) * lrRow[sp[-i-1]]; + } + + backgroundProb = b * b2b + fromForeground * f2b; + } + + void calcEmissionAndBackwardTransitionProbs() { + if (endGapProb > 0) { + calcEmissionProbs(); + calcBackwardTransitionProbsWithGaps(); + return; + } + + double toBackground = f2b * backgroundProb; + const double *b2f = BEG(b2fProbs); + double *fp = BEG(foregroundProbs); + const double *lrRow = likelihoodRatioMatrix[*seqPtr]; + int maxOffset = maxOffsetInTheSequence(); + const uchar *sp = seqPtr; + + SimdDbl bV = simdFillDbl(toBackground); + SimdDbl tV = simdFillDbl(f2f0); + SimdDbl sV = simdZeroDbl(); + + int i = 0; + for (; i <= maxOffset - simdDblLen; i += simdDblLen) { + SimdDbl rV = simdSetDbl( +#if defined __SSE4_1__ || defined __ARM_NEON +#ifdef __AVX2__ + lrRow[sp[-i-4]], + lrRow[sp[-i-3]], +#endif + lrRow[sp[-i-2]], +#endif + lrRow[sp[-i-1]]); + SimdDbl fV = simdMulDbl(simdLoadDbl(fp+i), rV); + sV = simdAddDbl(sV, simdMulDbl(simdLoadDbl(b2f+i), fV)); + simdStoreDbl(fp+i, simdAddDbl(bV, simdMulDbl(tV, fV))); + } + double toForeground = simdHorizontalAddDbl(sV); + for (; i < maxOffset; ++i) { + double f = fp[i] * lrRow[sp[-i-1]]; + toForeground += b2f[i] * f; + fp[i] = toBackground + f2f0 * f; + } + + backgroundProb = b2b * backgroundProb + toForeground; + } + + void rescale(double scale) { + backgroundProb *= scale; + multiplyAll(foregroundProbs, scale); + multiplyAll(insertionProbs, scale); + } + + void rescaleForward() { + if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { + assert(backgroundProb > 0); + double scale = 1 / backgroundProb; + scaleFactors[(seqPtr - seqBeg) / scaleStepSize] = scale; + rescale(scale); + } + } + + void rescaleBackward() { + if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { + double scale = scaleFactors[(seqPtr - seqBeg) / scaleStepSize]; + rescale(scale); + } + } + + void calcRepeatProbs(float *letterProbs) { + initializeForwardAlgorithm(); + + while (seqPtr < seqEnd) { + calcForwardTransitionAndEmissionProbs(); + rescaleForward(); + *letterProbs = static_cast(backgroundProb); + ++letterProbs; + ++seqPtr; + } + + double z = forwardTotal(); + + initializeBackwardAlgorithm(); + + while (seqPtr > seqBeg) { + --seqPtr; + --letterProbs; + double nonRepeatProb = *letterProbs * backgroundProb / z; + // Convert nonRepeatProb to a float, so that it is more likely + // to be exactly 1 when it should be, e.g. for the 1st letter of + // a sequence: + *letterProbs = 1 - static_cast(nonRepeatProb); + rescaleBackward(); + calcEmissionAndBackwardTransitionProbs(); + } + + double z2 = backwardTotal(); + checkForwardAndBackwardTotals(z, z2); + } + + void countTransitions(double *transitionCounts) { + std::vector p(seqEnd - seqBeg); + float *letterProbs = BEG(p); + + initializeForwardAlgorithm(); + + while (seqPtr < seqEnd) { + *letterProbs = static_cast(backgroundProb); + calcForwardTransitionProbs(); + calcEmissionProbs(); + rescaleForward(); + ++letterProbs; + ++seqPtr; + } + + double z = forwardTotal(); + + addEndCounts(backgroundProb, z, transitionCounts); + + initializeBackwardAlgorithm(); + + while (seqPtr > seqBeg) { + --seqPtr; + --letterProbs; + rescaleBackward(); + calcEmissionProbs(); + addTransitionCounts(*letterProbs, z, transitionCounts); + calcBackwardTransitionProbs(); + } + + double z2 = backwardTotal(); + checkForwardAndBackwardTotals(z, z2); + } +}; + +void maskSequences(uchar *seqBeg, + uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + double minMaskProb, + const uchar *maskTable) { + std::vector p(seqEnd - seqBeg); + float *probabilities = BEG(p); + + getProbabilities(seqBeg, seqEnd, maxRepeatOffset, + likelihoodRatioMatrix, repeatProb, repeatEndProb, + repeatOffsetProbDecay, firstGapProb, otherGapProb, + probabilities); + + maskProbableLetters(seqBeg, seqEnd, probabilities, minMaskProb, maskTable); +} + +void getProbabilities(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + float *probabilities) { + Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, + repeatProb, repeatEndProb, repeatOffsetProbDecay, + firstGapProb, otherGapProb); + tantan.calcRepeatProbs(probabilities); +} + +void maskProbableLetters(uchar *seqBeg, + uchar *seqEnd, + const float *probabilities, + double minMaskProb, + const uchar *maskTable) { + while (seqBeg < seqEnd) { + if (*probabilities >= minMaskProb) + *seqBeg = maskTable[*seqBeg]; + ++probabilities; + ++seqBeg; + } +} + +void countTransitions(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + double *transitionCounts) { + Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, + repeatProb, repeatEndProb, repeatOffsetProbDecay, + firstGapProb, otherGapProb); + tantan.countTransitions(transitionCounts); +} + +} diff --git a/src/commons/tantan.h b/lib/tantan/tantan.h similarity index 68% rename from src/commons/tantan.h rename to lib/tantan/tantan.h index 88af9d7ef..08da49c3b 100644 --- a/src/commons/tantan.h +++ b/lib/tantan/tantan.h @@ -1,16 +1,6 @@ -// Copyright 2010 Martin C. Frith -// tantan is distributed under the GNU General Public License, either -// version 3 of the License, or (at your option) any later version. For -// details, see COPYING.txt. -// -// If you use tantan in your research, please cite: -// "A new repeat-masking method enables specific detection of homologous -// sequences", MC Frith, Nucleic Acids Research 2011 39(4):e23. -// -// tantan's website is: http://www.cbrc.jp/tantan/ -// -// If you have any questions, comments, or problems concerning tantan, -// please email: tantan (ATmark) cbrc (dot) jp. +// Author: Martin C. Frith 2010 +// SPDX-License-Identifier: MPL-2.0 + // These are routines for masking simple regions (low-complexity and // short-period tandem repeats) in biological sequences. To // understand them in detail, see the published article (in @@ -64,10 +54,11 @@ namespace tantan { +typedef unsigned char uchar; typedef const double *const_double_ptr; -int maskSequences(char *seqBeg, - char *seqEnd, +void maskSequences(uchar *seqBeg, + uchar *seqEnd, int maxRepeatOffset, const const_double_ptr *likelihoodRatioMatrix, double repeatProb, @@ -76,14 +67,14 @@ int maskSequences(char *seqBeg, double firstGapProb, double otherGapProb, double minMaskProb, - const char *maskTable); + const uchar *maskTable); // The following routine gets the posterior probability that each // letter is repetitive. It stores the results in "probabilities", // which must point to enough pre-allocated space to fit the results. -void getProbabilities(const char *seqBeg, - const char *seqEnd, +void getProbabilities(const uchar *seqBeg, + const uchar *seqEnd, int maxRepeatOffset, const const_double_ptr *likelihoodRatioMatrix, double repeatProb, @@ -96,11 +87,34 @@ void getProbabilities(const char *seqBeg, // The following routine masks each letter whose corresponding entry // in "probabilities" is >= minMaskProb. -int maskProbableLetters(char *seqBeg, - char *seqEnd, +void maskProbableLetters(uchar *seqBeg, + uchar *seqEnd, const float *probabilities, double minMaskProb, - const char *maskTable); + const uchar *maskTable); + +// The following routine counts the expected number of transitions +// from the background (non-repeat) state to other states. It adds +// the results to "transitionCounts", which must point to +// pre-initialized space for (maxRepeatOffset+1) items. The +// background->background transition count is stored in +// transitionCounts[0]. The background->(period-i repeat) transition +// count is stored in transitionCounts[i]. + +// (In this routine, the HMM begin and end states are counted as +// background states. Thus, begin->X is added to background->X, and +// X->end is added to X->background.) + +void countTransitions(const uchar *seqBeg, + const uchar *seqEnd, + int maxRepeatOffset, + const const_double_ptr *likelihoodRatioMatrix, + double repeatProb, + double repeatEndProb, + double repeatOffsetProbDecay, + double firstGapProb, + double otherGapProb, + double *transitionCounts); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a84726da5..320cd3a38 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -130,7 +130,7 @@ else () message("-- OMPTL sorting fallback") endif () -target_link_libraries(mmseqs-framework tinyexpr ${ZSTD_LIBRARIES} microtar) +target_link_libraries(mmseqs-framework tinyexpr ${ZSTD_LIBRARIES} microtar tantan) # if (CYGWIN) # target_link_libraries(mmseqs-framework nedmalloc) # endif () diff --git a/src/alignment/PSSMMasker.h b/src/alignment/PSSMMasker.h index e00007c2c..e598165c8 100644 --- a/src/alignment/PSSMMasker.h +++ b/src/alignment/PSSMMasker.h @@ -9,7 +9,7 @@ class PSSMMasker { public: PSSMMasker(size_t maxSeqLen, ProbabilityMatrix& probMatrix, BaseMatrix& subMat) : maxSeqLen(maxSeqLen), probMatrix(probMatrix), xAmioAcid(subMat.aa2num[static_cast('X')]) { - charSequence = (char*)malloc(sizeof(char) * maxSeqLen); + charSequence = (unsigned char*)malloc(sizeof(char) * maxSeqLen); } ~PSSMMasker() { @@ -19,7 +19,7 @@ class PSSMMasker { void mask(Sequence& centerSequence, float maskProb, PSSMCalculator::Profile& pssmRes) { if ((size_t)centerSequence.L > maxSeqLen) { maxSeqLen = sizeof(char) * centerSequence.L * 1.5; - charSequence = (char*)realloc(charSequence, maxSeqLen); + charSequence = (unsigned char*)realloc(charSequence, maxSeqLen); } memcpy(charSequence, centerSequence.numSequence, sizeof(unsigned char) * centerSequence.L); tantan::maskSequences(charSequence, charSequence + centerSequence.L, @@ -43,7 +43,7 @@ class PSSMMasker { } } private: - char *charSequence; + unsigned char *charSequence; size_t maxSeqLen; ProbabilityMatrix& probMatrix; const int xAmioAcid; diff --git a/src/commons/BaseMatrix.h b/src/commons/BaseMatrix.h index 7d86bdf22..4e9783799 100644 --- a/src/commons/BaseMatrix.h +++ b/src/commons/BaseMatrix.h @@ -102,7 +102,7 @@ class ProbabilityMatrix { delete[] probMatrixPointers; } - char hardMaskTable[256]; + unsigned char hardMaskTable[256]; const double **probMatrixPointers; private: diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt index dc7ab3e74..35f59b392 100644 --- a/src/commons/CMakeLists.txt +++ b/src/commons/CMakeLists.txt @@ -38,7 +38,6 @@ set(commons_header_files commons/StringBlock.h commons/SubstitutionMatrix.h commons/SubstitutionMatrixProfileStates.h - commons/tantan.h commons/TranslateNucl.h commons/Timer.h commons/UniprotKB.h @@ -72,7 +71,6 @@ set(commons_source_files commons/Sequence.cpp commons/SequenceWeights.cpp commons/SubstitutionMatrix.cpp - commons/tantan.cpp commons/UniprotKB.cpp commons/Util.cpp PARENT_SCOPE diff --git a/src/commons/tantan.cpp b/src/commons/tantan.cpp deleted file mode 100644 index e870fa58b..000000000 --- a/src/commons/tantan.cpp +++ /dev/null @@ -1,506 +0,0 @@ -// Copyright 2010 Martin C. Frith - -#include "tantan.h" - -#include // fill, max -#include -#include // pow, abs -#include // cerr -#include // accumulate -#include - -#define BEG(v) ((v).empty() ? 0 : &(v).front()) -#define END(v) ((v).empty() ? 0 : &(v).back() + 1) - -namespace tantan { - - void multiplyAll(std::vector &v, double factor) { - for (std::vector::iterator i = v.begin(); i < v.end(); ++i) - *i *= factor; - } - - double firstRepeatOffsetProb(double probMult, int maxRepeatOffset) { - if (probMult < 1 || probMult > 1) { - return (1 - probMult) / (1 - std::pow(probMult, maxRepeatOffset)); - } - return 1.0 / maxRepeatOffset; - } - - void checkForwardAndBackwardTotals(double fTot, double bTot) { - double x = std::abs(fTot); - double y = std::abs(bTot); - - // ??? Is 1e6 suitable here ??? - if (std::abs(fTot - bTot) > std::max(x, y) / 1e6) - std::cerr << "tantan: warning: possible numeric inaccuracy\n" - << "tantan: forward algorithm total: " << fTot << "\n" - << "tantan: backward algorithm total: " << bTot << "\n"; - } - - struct Tantan { - enum { scaleStepSize = 16 }; - - const char *seqBeg; // start of the sequence - const char *seqEnd; // end of the sequence - const char *seqPtr; // current position in the sequence - - int maxRepeatOffset; - - const const_double_ptr *likelihoodRatioMatrix; - - double b2b; // transition probability from background to background - double f2b; // transition probability from foreground to background - double g2g; // transition probability from gap/indel to gap/indel - //double f2g; // transition probability from foreground to gap/indel - //double g2f; // transition probability from gap/indel to foreground - double oneGapProb; // f2g * g2f - double endGapProb; // f2g * 1 - double f2f0; // foreground to foreground, if there are 0 indel transitions - double f2f1; // foreground to foreground, if there is 1 indel transition - double f2f2; // foreground to foreground, if there are 2 indel transitions - double b2fDecay; - double b2fGrowth; - double b2fFirst; // background state to first foreground state - double b2fLast; // background state to last foreground state - - double backgroundProb; - std::vector b2fProbs; // background state to each foreground state - std::vector foregroundProbs; - std::vector insertionProbs; - - std::vector scaleFactors; - - Tantan(const char *seqBeg, - const char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb) { - assert(maxRepeatOffset > 0); - assert(repeatProb >= 0 && repeatProb < 1); - // (if repeatProb==1, then any sequence is impossible) - assert(repeatEndProb >= 0 && repeatEndProb <= 1); - assert(repeatOffsetProbDecay > 0 && repeatOffsetProbDecay <= 1); - assert(otherGapProb >= 0 && otherGapProb <= 1); - assert(firstGapProb >= 0); - assert(repeatEndProb + firstGapProb * 2 <= 1); - - this->seqBeg = seqBeg; - this->seqEnd = seqEnd; - this->seqPtr = seqBeg; - this->maxRepeatOffset = maxRepeatOffset; - this->likelihoodRatioMatrix = likelihoodRatioMatrix; - - b2b = 1 - repeatProb; - f2b = repeatEndProb; - g2g = otherGapProb; - //f2g = firstGapProb; - //g2f = 1 - otherGapProb; - oneGapProb = firstGapProb * (1 - otherGapProb); - endGapProb = firstGapProb * (maxRepeatOffset > 1); - f2f0 = 1 - repeatEndProb; - f2f1 = 1 - repeatEndProb - firstGapProb; - f2f2 = 1 - repeatEndProb - firstGapProb * 2; - - b2fDecay = repeatOffsetProbDecay; - b2fGrowth = 1 / repeatOffsetProbDecay; - - b2fFirst = repeatProb * firstRepeatOffsetProb(b2fDecay, maxRepeatOffset); - b2fLast = repeatProb * firstRepeatOffsetProb(b2fGrowth, maxRepeatOffset); - - b2fProbs.resize(maxRepeatOffset); - foregroundProbs.resize(maxRepeatOffset); - insertionProbs.resize(maxRepeatOffset - 1); - - double p = b2fFirst; - for (int i = 0; i < maxRepeatOffset; ++i) { - b2fProbs[i] = p; - p *= b2fDecay; - } - - scaleFactors.resize((seqEnd - seqBeg) / scaleStepSize); - } - - void initializeForwardAlgorithm() { - backgroundProb = 1.0; - std::fill(foregroundProbs.begin(), foregroundProbs.end(), 0.0); - std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); - } - - double forwardTotal() { - double fromForeground = std::accumulate(foregroundProbs.begin(), - foregroundProbs.end(), 0.0); - double total = backgroundProb * b2b + fromForeground * f2b; - assert(total > 0); - return total; - } - - void initializeBackwardAlgorithm() { - backgroundProb = b2b; - std::fill(foregroundProbs.begin(), foregroundProbs.end(), f2b); - std::fill(insertionProbs.begin(), insertionProbs.end(), 0.0); - } - - double backwardTotal() { - assert(backgroundProb > 0); - return backgroundProb; - } - - void calcForwardTransitionProbsWithGaps() { - double fromBackground = backgroundProb * b2fLast; - double *foregroundPtr = &foregroundProbs.back(); - double f = *foregroundPtr; - double fromForeground = f; - - double *insertionPtr = &insertionProbs.back(); - double i = *insertionPtr; - *foregroundPtr = fromBackground + f * f2f1 + i * endGapProb; - double d = f; - --foregroundPtr; - fromBackground *= b2fGrowth; - - while (foregroundPtr > &foregroundProbs.front()) { - f = *foregroundPtr; - fromForeground += f; - i = *(insertionPtr - 1); - *foregroundPtr = fromBackground + f * f2f2 + (i + d) * oneGapProb; - *insertionPtr = f + i * g2g; - d = f + d * g2g; - --foregroundPtr; - --insertionPtr; - fromBackground *= b2fGrowth; - } - - f = *foregroundPtr; - fromForeground += f; - *foregroundPtr = fromBackground + f * f2f1 + d * endGapProb; - *insertionPtr = f; - - backgroundProb = backgroundProb * b2b + fromForeground * f2b; - } - - void calcBackwardTransitionProbsWithGaps() { - double toBackground = f2b * backgroundProb; - double *foregroundPtr = &foregroundProbs.front(); - double f = *foregroundPtr; - double toForeground = f; - - double *insertionPtr = &insertionProbs.front(); - double i = *insertionPtr; - *foregroundPtr = toBackground + f2f1 * f + i; - double d = endGapProb * f; - ++foregroundPtr; - toForeground *= b2fGrowth; - - while (foregroundPtr < &foregroundProbs.back()) { - f = *foregroundPtr; - toForeground += f; - i = *(insertionPtr + 1); - *foregroundPtr = toBackground + f2f2 * f + (i + d); - double oneGapProb_f = oneGapProb * f; - *insertionPtr = oneGapProb_f + g2g * i; - d = oneGapProb_f + g2g * d; - ++foregroundPtr; - ++insertionPtr; - toForeground *= b2fGrowth; - } - - f = *foregroundPtr; - toForeground += f; - *foregroundPtr = toBackground + f2f1 * f + d; - *insertionPtr = endGapProb * f; - - backgroundProb = b2b * backgroundProb + b2fLast * toForeground; - } - - void calcForwardTransitionProbs() { - if (endGapProb > 0) return calcForwardTransitionProbsWithGaps(); - - double b = backgroundProb; - double fromForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - - for (int i = 0; i < maxRepeatOffset; ++i) { - double f = foregroundBeg[i]; - fromForeground += f; - foregroundBeg[i] = b * b2fProbs[i] + f * f2f0; - } - - backgroundProb = b * b2b + fromForeground * f2b; - } - - void calcBackwardTransitionProbs() { - if (endGapProb > 0) return calcBackwardTransitionProbsWithGaps(); - - double toBackground = f2b * backgroundProb; - double toForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - - for (int i = 0; i < maxRepeatOffset; ++i) { - double f = foregroundBeg[i]; - toForeground += b2fProbs[i] * f; - foregroundBeg[i] = toBackground + f2f0 * f; - } - - backgroundProb = b2b * backgroundProb + toForeground; - } - - void addEndCounts(double forwardProb, - double totalProb, - double *transitionCounts) { - double toEnd = forwardProb * b2b / totalProb; - transitionCounts[0] += toEnd; - } - - void addTransitionCounts(double forwardProb, - double totalProb, - double *transitionCounts) { - double toBg = forwardProb * b2b / totalProb; - double toFg = forwardProb * b2fFirst / totalProb; - - transitionCounts[0] += backgroundProb * toBg; - - for (double *i = BEG(foregroundProbs); i < END(foregroundProbs); ++i) { - ++transitionCounts; - *transitionCounts += *i * toFg; - toFg *= b2fDecay; - } - } - - bool isNearSeqBeg() { - return seqPtr - seqBeg < maxRepeatOffset; - } - - int maxOffsetInTheSequence() { - return isNearSeqBeg() ? (seqPtr - seqBeg) : maxRepeatOffset; - } - - const char *seqFurthestBack() { - return isNearSeqBeg() ? seqBeg : seqPtr - maxRepeatOffset; - } - - void calcEmissionProbs() { - const double *lrRow = likelihoodRatioMatrix[(int)*seqPtr]; - const char *seqStop = seqFurthestBack(); - double *foregroundPtr = BEG(foregroundProbs); - const char *offsetPtr = seqPtr; - - while (offsetPtr > seqStop) { - --offsetPtr; - *foregroundPtr *= lrRow[(int)*offsetPtr]; - ++foregroundPtr; - } - - while (foregroundPtr < END(foregroundProbs)) { - *foregroundPtr *= 0; - ++foregroundPtr; - } - } - - void calcForwardTransitionAndEmissionProbs() { - if (endGapProb > 0) { - calcForwardTransitionProbsWithGaps(); - calcEmissionProbs(); - return; - } - - double b = backgroundProb; - double fromForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - const double *lrRow = likelihoodRatioMatrix[(int)*seqPtr]; - int maxOffset = maxOffsetInTheSequence(); - - for (int i = 0; i < maxOffset; ++i) { - double f = foregroundBeg[i]; - fromForeground += f; - foregroundBeg[i] = (b * b2fProbs[i] + f * f2f0) * lrRow[(int)seqPtr[-i-1]]; - } - - backgroundProb = b * b2b + fromForeground * f2b; - } - - void calcEmissionAndBackwardTransitionProbs() { - if (endGapProb > 0) { - calcEmissionProbs(); - calcBackwardTransitionProbsWithGaps(); - return; - } - - double toBackground = f2b * backgroundProb; - double toForeground = 0; - double *foregroundBeg = BEG(foregroundProbs); - const double *lrRow = likelihoodRatioMatrix[(int)*seqPtr]; - int maxOffset = maxOffsetInTheSequence(); - - for (int i = 0; i < maxOffset; ++i) { - double f = foregroundBeg[i] * lrRow[(int)seqPtr[-i-1]]; - toForeground += b2fProbs[i] * f; - foregroundBeg[i] = toBackground + f2f0 * f; - } - - backgroundProb = b2b * backgroundProb + toForeground; - } - - void rescale(double scale) { - backgroundProb *= scale; - multiplyAll(foregroundProbs, scale); - multiplyAll(insertionProbs, scale); - } - - void rescaleForward() { - if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { - assert(backgroundProb > 0); - double scale = 1 / backgroundProb; - scaleFactors[(seqPtr - seqBeg) / scaleStepSize] = scale; - rescale(scale); - } - } - - void rescaleBackward() { - if ((seqPtr - seqBeg) % scaleStepSize == scaleStepSize - 1) { - double scale = scaleFactors[(seqPtr - seqBeg) / scaleStepSize]; - rescale(scale); - } - } - - void calcRepeatProbs(float *letterProbs) { - initializeForwardAlgorithm(); - - while (seqPtr < seqEnd) { - calcForwardTransitionAndEmissionProbs(); - rescaleForward(); - *letterProbs = static_cast(backgroundProb); - ++letterProbs; - ++seqPtr; - } - - double z = forwardTotal(); - - initializeBackwardAlgorithm(); - - while (seqPtr > seqBeg) { - --seqPtr; - --letterProbs; - double nonRepeatProb = *letterProbs * backgroundProb / z; - // Convert nonRepeatProb to a float, so that it is more likely - // to be exactly 1 when it should be, e.g. for the 1st letter of - // a sequence: - *letterProbs = 1 - static_cast(nonRepeatProb); - rescaleBackward(); - calcEmissionAndBackwardTransitionProbs(); - } - - double z2 = backwardTotal(); - checkForwardAndBackwardTotals(z, z2); - } - - void countTransitions(double *transitionCounts) { - std::vector p(seqEnd - seqBeg); - float *letterProbs = BEG(p); - - initializeForwardAlgorithm(); - - while (seqPtr < seqEnd) { - *letterProbs = static_cast(backgroundProb); - calcForwardTransitionProbs(); - calcEmissionProbs(); - rescaleForward(); - ++letterProbs; - ++seqPtr; - } - - double z = forwardTotal(); - - addEndCounts(backgroundProb, z, transitionCounts); - - initializeBackwardAlgorithm(); - - while (seqPtr > seqBeg) { - --seqPtr; - --letterProbs; - rescaleBackward(); - calcEmissionProbs(); - addTransitionCounts(*letterProbs, z, transitionCounts); - calcBackwardTransitionProbs(); - } - - double z2 = backwardTotal(); - checkForwardAndBackwardTotals(z, z2); - } - }; - - int maskSequences(char *seqBeg, - char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb, - double minMaskProb, - const char *maskTable) { - std::vector p(seqEnd - seqBeg); - float *probabilities = BEG(p); - - getProbabilities(seqBeg, seqEnd, maxRepeatOffset, - likelihoodRatioMatrix, repeatProb, repeatEndProb, - repeatOffsetProbDecay, firstGapProb, otherGapProb, - probabilities); - - return maskProbableLetters(seqBeg, seqEnd, probabilities, minMaskProb, maskTable); - } - - void getProbabilities(const char *seqBeg, - const char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb, - float *probabilities) { - Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, - repeatProb, repeatEndProb, repeatOffsetProbDecay, - firstGapProb, otherGapProb); - tantan.calcRepeatProbs(probabilities); - } - - int maskProbableLetters(char *seqBeg, - char *seqEnd, - const float *probabilities, - double minMaskProb, - const char *maskTable) { - int masked = 0; - while (seqBeg < seqEnd) { - if (*probabilities >= minMaskProb){ - *seqBeg = maskTable[(int)*seqBeg]; - masked++; - } - ++probabilities; - ++seqBeg; - } - return masked; - } - - void countTransitions(const char *seqBeg, - const char *seqEnd, - int maxRepeatOffset, - const const_double_ptr *likelihoodRatioMatrix, - double repeatProb, - double repeatEndProb, - double repeatOffsetProbDecay, - double firstGapProb, - double otherGapProb, - double *transitionCounts) { - Tantan tantan(seqBeg, seqEnd, maxRepeatOffset, likelihoodRatioMatrix, - repeatProb, repeatEndProb, repeatOffsetProbDecay, - firstGapProb, otherGapProb); - tantan.countTransitions(transitionCounts); - } - -} diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 3e0740c59..63ecb4c33 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -56,8 +56,8 @@ KmerPosition *initKmerPositionMemory(size_t size) { void maskSequence(int maskMode, int maskLowerCase, float maskProb, Sequence &seq, int maskLetter, ProbabilityMatrix * probMatrix){ if (maskMode == 1) { - tantan::maskSequences((char*)seq.numSequence, - (char*)(seq.numSequence + seq.L), + tantan::maskSequences((unsigned char*)seq.numSequence, + (unsigned char*)(seq.numSequence + seq.L), 50 /*options.maxCycleLength*/, probMatrix->probMatrixPointers, 0.005 /*options.repeatProb*/, diff --git a/src/prefiltering/IndexBuilder.cpp b/src/prefiltering/IndexBuilder.cpp index 120a173bb..e4e6eea9d 100644 --- a/src/prefiltering/IndexBuilder.cpp +++ b/src/prefiltering/IndexBuilder.cpp @@ -144,8 +144,9 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL } if (mask == true) { // s.print(); - maskedResidues += tantan::maskSequences((char*)s.numSequence, - (char*)(s.numSequence + s.L), + //TODO maskedResidues =; + tantan::maskSequences((unsigned char*)s.numSequence, + (unsigned char*)(s.numSequence + s.L), 50 /*options.maxCycleLength*/, probMatrix->probMatrixPointers, 0.005 /*options.repeatProb*/, diff --git a/src/util/masksequence.cpp b/src/util/masksequence.cpp index 82a468c76..8c09d2314 100644 --- a/src/util/masksequence.cpp +++ b/src/util/masksequence.cpp @@ -43,7 +43,7 @@ int masksequence(int argc, const char **argv, const Command& command) { #ifdef OPENMP thread_idx = (unsigned int) omp_get_thread_num(); #endif - char *charSequence = new char[maxSeqLen + 1]; + unsigned char *charSequence = new unsigned char[maxSeqLen + 1]; #pragma omp for schedule(dynamic, 1) for (size_t id = 0; id < reader.getSize(); ++id) { @@ -68,7 +68,7 @@ int masksequence(int argc, const char **argv, const Command& command) { char aa = seqData[pos]; charSequence[pos] = (charSequence[pos] == probMatrix.hardMaskTable[0]) ? tolower(aa) : toupper(aa); } - writer.writeData(charSequence, seqLen, reader.getDbKey(id), thread_idx); + writer.writeData((char*)charSequence, seqLen, reader.getDbKey(id), thread_idx); } delete[] charSequence; }