Skip to content

Commit

Permalink
Merge commit '56c5806585c479863b730d3d4ddff4a429b8cbba'
Browse files Browse the repository at this point in the history
  • Loading branch information
elileka committed Sep 4, 2023
2 parents e8ef9c1 + 56c5806 commit 6bd769f
Show file tree
Hide file tree
Showing 35 changed files with 694 additions and 213 deletions.
6 changes: 3 additions & 3 deletions lib/mmseqs/data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ case "${SELECTION}" in
;;
"GTDB")
if notExists "${TMP_PATH}/download.done"; then
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION" "${TMP_PATH}/version"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/VERSION.txt" "${TMP_PATH}/version"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz" "${TMP_PATH}/gtdb.tar.gz"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_taxonomy.tsv" "${TMP_PATH}/bac120_taxonomy.tsv"
downloadFile "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_taxonomy.tsv" "${TMP_PATH}/ar53_taxonomy.tsv"
Expand Down Expand Up @@ -371,9 +371,9 @@ case "${INPUT_TYPE}" in
;;
"GTDB")
# shellcheck disable=SC2086
"${MMSEQS}" tar2db "${TMP_PATH}/gtdb.tar.gz" "${TMP_PATH}/tardb" --tar-include 'faa$' ${THREADS_PAR} \
"${MMSEQS}" tar2db "${TMP_PATH}/gtdb.tar.gz" "${TMP_PATH}/tardb" --tar-include 'faa.gz$' ${THREADS_PAR} \
|| fail "tar2db died"
sed 's|_protein\.faa||g' "${TMP_PATH}/tardb.lookup" > "${TMP_PATH}/tardb.lookup.tmp"
sed 's|_protein\.faa\.gz||g' "${TMP_PATH}/tardb.lookup" > "${TMP_PATH}/tardb.lookup.tmp"
mv -f -- "${TMP_PATH}/tardb.lookup.tmp" "${TMP_PATH}/tardb.lookup"
# shellcheck disable=SC2086
"${MMSEQS}" createdb "${TMP_PATH}/tardb" "${OUTDB}" ${COMP_PAR} \
Expand Down
19 changes: 11 additions & 8 deletions lib/mmseqs/lib/ksw2/kseq.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>

#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
Expand All @@ -39,8 +40,9 @@

#define __KS_TYPE(type_t) \
typedef struct __kstream_t { \
char *buf; \
int begin, end, is_eof; \
char *buf; \
int64_t begin, end; \
int is_eof; \
size_t cur_buf_pos; \
size_t newline; \
type_t f; \
Expand Down Expand Up @@ -94,13 +96,13 @@ typedef struct __kstring_t {
#endif

#define __KS_GETUNTIL(__read, __bufsize) \
static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
static int64_t ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
{ \
int gotany = 0; \
if (dret) *dret = 0; \
str->l = append? str->l : 0; \
for (;;) { \
int i; \
int64_t i; \
if (ks_err(ks)) return -3; \
if (ks->begin >= ks->end) { \
if (!ks->is_eof) { \
Expand Down Expand Up @@ -146,7 +148,7 @@ typedef struct __kstring_t {
str->s[str->l] = '\0'; \
return str->l; \
} \
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
static inline int64_t ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }

#define KSTREAM_INIT(type_t, __read, __bufsize) \
Expand Down Expand Up @@ -182,9 +184,10 @@ typedef struct __kstring_t {
-3 error reading stream
*/
#define __KSEQ_READ(SCOPE) \
SCOPE int kseq_read(kseq_t *seq) \
SCOPE int64_t kseq_read(kseq_t *seq) \
{ \
int c,r; \
int c; \
int64_t r; \
kstream_t *ks = seq->f; \
ks->newline = 0; \
if (seq->last_char == 0) { /* then jump to the next header line */ \
Expand Down Expand Up @@ -255,6 +258,6 @@ typedef struct __kstring_t {
__KSEQ_TYPE(type_t) \
extern kseq_t *kseq_init(type_t fd); \
void kseq_destroy(kseq_t *ks); \
int kseq_read(kseq_t *seq);
int64_t kseq_read(kseq_t *seq);

#endif
3 changes: 3 additions & 0 deletions lib/mmseqs/src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ extern int mergedbs(int argc, const char **argv, const Command& command);
extern int mergeresultsbyset(int argc, const char **argv, const Command &command);
extern int msa2profile(int argc, const char **argv, const Command& command);
extern int sequence2profile(int argc, const char **argv, const Command& command);
extern int createclusearchdb(int argc, const char **argv, const Command& command);
extern int msa2result(int argc, const char **argv, const Command& command);
extern int multihitdb(int argc, const char **argv, const Command& command);
extern int multihitsearch(int argc, const char **argv, const Command& command);
Expand All @@ -93,6 +94,7 @@ extern int profile2repseq(int argc, const char **argv, const Command& command);
extern int proteinaln2nucl(int argc, const char **argv, const Command& command);
extern int rescorediagonal(int argc, const char **argv, const Command& command);
extern int ungappedprefilter(int argc, const char **argv, const Command& command);
extern int gappedprefilter(int argc, const char **argv, const Command& command);
extern int unpackdb(int argc, const char **argv, const Command& command);
extern int rbh(int argc, const char **argv, const Command& command);
extern int result2flat(int argc, const char **argv, const Command& command);
Expand All @@ -107,6 +109,7 @@ extern int search(int argc, const char **argv, const Command& command);
extern int linsearch(int argc, const char **argv, const Command& command);
extern int sortresult(int argc, const char **argv, const Command& command);
extern int splitdb(int argc, const char **argv, const Command& command);
extern int setextendeddbtype(int argc, const char **argv, const Command& command);
extern int splitsequence(int argc, const char **argv, const Command& command);
extern int subtractdbs(int argc, const char **argv, const Command& command);
extern int suffixid(int argc, const char **argv, const Command& command);
Expand Down
28 changes: 23 additions & 5 deletions lib/mmseqs/src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,14 @@ std::vector<Command> baseCommands = {
CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
{"gappedprefilter", gappedprefilter, &par.gappedprefilter, COMMAND_PREFILTER,
"Optimal Smith-Waterman-based prefiltering (slow)",
NULL,
"Martin Steinegger <[email protected]>",
"<i:queryDB> <i:targetDB> <o:prefilterDB>",
CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
{"kmermatcher", kmermatcher, &par.kmermatcher, COMMAND_PREFILTER,
"Find bottom-m-hashed k-mer matches within sequence DB",
NULL,
Expand Down Expand Up @@ -793,8 +801,13 @@ std::vector<Command> baseCommands = {
CITATION_MMSEQS2, {{"resultDBLeft", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"resultDBRight", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},


{"setextendeddbtype", setextendeddbtype, &par.extendeddbtype, COMMAND_DB,
"Write an extended DB ",
"# Print entries with keys 1, 2 and 3 from a sequence DB to stdout\n"
"mmseqs setextendedbtype db --extended-dbtype 2\n",
"Martin Steinegger <[email protected]>",
"<i:DB>",
CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},

{"view", view, &par.view, COMMAND_DB,
"Print DB entries given in --id-list to stdout",
Expand Down Expand Up @@ -1245,9 +1258,14 @@ std::vector<Command> baseCommands = {
"Martin Steinegger <[email protected]>",
"<i:sequenceDB> ",
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},



{"createclusearchdb", createclusearchdb, &par.createclusearchdb, COMMAND_HIDDEN,
"Separates a sequence DB into a representative and a non-representative DB",
NULL,
"Martin Steinegger <[email protected]>",
"<i:sequenceDB> <i:resultDB> <o:sequenceDB>",
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
{"dbtype", dbtype, &par.empty, COMMAND_HIDDEN,
"",
NULL,
Expand Down
9 changes: 7 additions & 2 deletions lib/mmseqs/src/alignment/BandedNucleotideAligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,15 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,
int queryLen = querySeqObj->L;
int origQueryLen = queryLen;
if (wrappedScoring) {
alignment = DistanceCalculator::computeUngappedWrappedAlignment(
if (querySeqObj->L >= targetSeqObj->L * 2)
alignment = DistanceCalculator::computeUngappedWrappedAlignment(
queryCharSeqAlign, querySeqObj->L, targetSeqObj->getSeqData(), targetSeqObj->L,
diagonal, fastMatrix.matrix, Parameters::RESCORE_MODE_ALIGNMENT);
origQueryLen = queryLen/2;
else
alignment = DistanceCalculator::computeUngappedAlignment(
queryCharSeqAlign, querySeqObj->L / 2, targetSeqObj->getSeqData(), targetSeqObj->L,
diagonal, fastMatrix.matrix, Parameters::RESCORE_MODE_ALIGNMENT);
origQueryLen = queryLen / 2;
}
else {
alignment = DistanceCalculator::computeUngappedAlignment(
Expand Down
3 changes: 2 additions & 1 deletion lib/mmseqs/src/alignment/MultipleAlignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,9 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente
unsigned int queryPos = result.qStartPos;
unsigned int targetPos = result.dbStartPos;
// HACK: score was 0 and sequence was rejected, so we fill in an empty gap sequence
// Needed for pairaln with dummy sequences
if(targetPos == UINT_MAX) {
Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n";
//Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n";
// fill up with gaps
for(size_t pos = 0; pos < centerSeqSize; pos++){
edgeSeqMSA[pos] = '-';
Expand Down
20 changes: 14 additions & 6 deletions lib/mmseqs/src/alignment/rescorediagonal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,23 @@ int doRescorediagonal(Parameters &par,
}
DistanceCalculator::LocalAlignment alignment;
if (par.wrappedScoring) {
/*
if (dbLen > origQueryLen) {
Debug(Debug::WARNING) << "WARNING: target sequence " << targetId
<< " is skipped, no valid wrapped scoring possible\n";
continue;
}

alignment = DistanceCalculator::computeUngappedWrappedAlignment(
Debug(Debug::WARNING) << "WARNING: target sequence " << targetId
<< " is skipped, no valid wrapped scoring possible\n";
continue;
}
*/
if (dbLen <= origQueryLen) {
alignment = DistanceCalculator::computeUngappedWrappedAlignment(
querySeqToAlign, queryLen, targetSeq, targetLength,
results[entryIdx].diagonal, fastMatrix.matrix, par.rescoreMode);
}
else{
alignment = DistanceCalculator::computeUngappedAlignment(
querySeqToAlign, origQueryLen, targetSeq, targetLength,
results[entryIdx].diagonal, fastMatrix.matrix, par.rescoreMode);
}
}
else {
alignment = DistanceCalculator::computeUngappedAlignment(
Expand Down
82 changes: 44 additions & 38 deletions lib/mmseqs/src/clustering/ClusteringAlgorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,66 +269,72 @@ void ClusteringAlgorithms::setCover(unsigned int **elementLookupTable, unsigned
}

void ClusteringAlgorithms::greedyIncrementalLowMem( unsigned int *assignedcluster) {
// two step clustering
// 1.) we define the rep. sequences by minimizing the ids (smaller ID = longer sequence)
// 2.) we correct maybe wrong assigned sequence by checking if the assigned sequence is really a rep. seq.
// if they are not make them rep. seq.
#pragma omp parallel
{
int thread_idx = 0;
#ifdef OPENMP
thread_idx = omp_get_thread_num();
#endif
#pragma omp for schedule(dynamic, 1000)
for(size_t i = 0; i < dbSize; i++) {
unsigned int clusterKey = seqDbr->getDbKey(i);
unsigned int clusterId = i;

// try to set your self as cluster centriod
// if some other cluster covered
unsigned int targetId;
__atomic_load(&assignedcluster[clusterId], &targetId ,__ATOMIC_RELAXED);
do {
if (targetId <= clusterId) break;
} while (!__atomic_compare_exchange(&assignedcluster[clusterId], &targetId, &clusterId , false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
const long BUFFER_SIZE = 100000; // Set this to a suitable value.
const long numBuffers = (dbSize + BUFFER_SIZE - 1) / BUFFER_SIZE;

// Pre-allocate buffer outside the loop to reuse it
std::vector<std::pair<unsigned int, std::vector<unsigned int>>> buffer(BUFFER_SIZE);

for (long bufferIndex = 0; bufferIndex < numBuffers; bufferIndex++) {
long start = bufferIndex * BUFFER_SIZE;
long end = std::min(start + BUFFER_SIZE, static_cast<long>(dbSize));

// Clear the vectors within the buffer, but don't deallocate
for (std::pair<unsigned int, std::vector<unsigned int>>& entry : buffer) {
entry.second.clear();
}

// Parallel reading and parsing into buffer
#pragma omp parallel for schedule(dynamic, 4)
for (long i = start; i < end; i++) {
unsigned int clusterKey = seqDbr->getDbKey(i);
const size_t alnId = alnDbr->getId(clusterKey);
char *data = alnDbr->getData(alnId, thread_idx);
char* data = alnDbr->getData(alnId, 0);

std::vector<unsigned int>& keys = buffer[i - start].second;
while (*data != '\0') {
char dbKey[255 + 1];
Util::parseKey(data, dbKey);
const unsigned int key = (unsigned int) strtoul(dbKey, NULL, 10);
const unsigned int key = (unsigned int)strtoul(dbKey, NULL, 10);
keys.push_back(key);
data = Util::skipLine(data);
}

unsigned int currElement = seqDbr->getId(key);
unsigned int targetId;
buffer[i - start].first = i;
}

__atomic_load(&assignedcluster[currElement], &targetId ,__ATOMIC_RELAXED);
do {
if (targetId <= clusterId) break;
} while (!__atomic_compare_exchange(&assignedcluster[currElement], &targetId, &clusterId , false, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
// Sequential processing of the buffer
for (long j = 0; j < (end - start); j++) {
unsigned int clusterId = buffer[j].first;
const std::vector<unsigned int>& keys = buffer[j].second;

if (currElement == UINT_MAX || currElement > seqDbr->getSize()) {
Debug(Debug::ERROR) << "Element " << dbKey
<< " contained in some alignment list, but not contained in the sequence database!\n";
EXIT(EXIT_FAILURE);
if (assignedcluster[clusterId] != UINT_MAX) {
continue;
}

if (keys.size() <= 1) {
continue;
}

for (unsigned int key : keys) {
unsigned int currElement = seqDbr->getId(key);

if (assignedcluster[currElement] == UINT_MAX) {
assignedcluster[currElement] = clusterId;
}
data = Util::skipLine(data);
}
}
}

// correct edges that are not assigned properly
for (size_t id = 0; id < dbSize; ++id) {
unsigned int assignedClusterId = assignedcluster[id];
// check if the assigned clusterid is a rep. sequence
// if not, make it a rep. seq. again
if (assignedcluster[assignedClusterId] != assignedClusterId){
assignedcluster[assignedClusterId] = assignedClusterId;
if(assignedcluster[id] == UINT_MAX){
assignedcluster[id] = id;
}
}

}

void ClusteringAlgorithms::readInClusterData(unsigned int **elementLookupTable, unsigned int *&elements,
Expand Down
1 change: 1 addition & 0 deletions lib/mmseqs/src/commons/Debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <stdlib.h>
#include <cstddef>
#include <sys/stat.h>
#include <cstdint>

class TtyCheck {
public:
Expand Down
24 changes: 22 additions & 2 deletions lib/mmseqs/src/commons/IndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,28 @@ class IndexReader {
}

if (sequenceReader == NULL) {
if ((databaseType & USER_SELECT) == false && databaseType & (HEADERS | SRC_HEADERS)) {
failSuffix = "_h";
if((databaseType & USER_SELECT) == false && failSuffix == "" ){
if (databaseType & HEADERS) {
failSuffix = "_h";
} else if (databaseType & SRC_HEADERS) {
failSuffix = "_seq_h";
if(FileUtil::fileExists((dataName + "_seq_h.dbtype").c_str())==false){
failSuffix = "_h";
}
} else if (databaseType & SRC_SEQUENCES) {
failSuffix = "_seq";
if(FileUtil::fileExists((dataName + "_seq.dbtype").c_str())==false){
failSuffix = "";
}
} else if (databaseType & SEQUENCES) {
failSuffix = "";
} else if (databaseType & ALIGNMENTS) {
if(FileUtil::fileExists((dataName + "_clu.dbtype").c_str())){
failSuffix = "_clu";
}else{
failSuffix = "_aln";
}
}
}
sequenceReader = new DBReader<unsigned int>(
(dataName + failSuffix).c_str(), (dataName + failSuffix + ".index").c_str(),
Expand Down
Loading

0 comments on commit 6bd769f

Please sign in to comment.