Skip to content

Commit

Permalink
Introduce experimental a3m filtering module (hidden by default)
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Apr 11, 2023
1 parent 10e0f1b commit 167bbd1
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ extern int extractalignedregion(int argc, const char **argv, const Command& comm
extern int extractdomains(int argc, const char **argv, const Command& command);
extern int extractorfs(int argc, const char **argv, const Command& command);
extern int extractframes(int argc, const char **argv, const Command& command);
extern int filtera3m(int argc, const char **argv, const Command& command);
extern int filterdb(int argc, const char **argv, const Command& command);
extern int filterresult(int argc, const char **argv, const Command& command);
extern int gff2db(int argc, const char **argv, const Command& command);
Expand Down
7 changes: 7 additions & 0 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -979,6 +979,13 @@ std::vector<Command> baseCommands = {
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
{"statsDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
{"filtera3m", filtera3m, &par.filtera3m, COMMAND_HIDDEN,
"DB filtering by given conditions",
"",
"Milot Mirdita <[email protected]>",
"<i:a3mFile> <o:a3mFile>",
CITATION_MMSEQS2, {{"a3mFile", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfileAndStdin },
{"a3mFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
{"filterresult", filterresult, &par.filterresult, COMMAND_RESULT,
"Pairwise alignment result filter",
NULL,
Expand Down
13 changes: 13 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,19 @@ Parameters::Parameters():
//result2msa.push_back(&PARAM_FIRST_SEQ_REP_SEQ);
result2dnamsa.push_back(&PARAM_V);
// filtera3m
filtera3m.push_back(&PARAM_SUB_MAT);
filtera3m.push_back(&PARAM_GAP_OPEN);
filtera3m.push_back(&PARAM_GAP_EXTEND);
filtera3m.push_back(&PARAM_FILTER_MIN_ENABLE);
filtera3m.push_back(&PARAM_FILTER_MAX_SEQ_ID);
filtera3m.push_back(&PARAM_FILTER_QID);
filtera3m.push_back(&PARAM_FILTER_QSC);
filtera3m.push_back(&PARAM_FILTER_COV);
filtera3m.push_back(&PARAM_FILTER_NDIFF);
filtera3m.push_back(&PARAM_V);
// filterresult
filterresult.push_back(&PARAM_SUB_MAT);
filterresult.push_back(&PARAM_GAP_OPEN);
Expand Down
1 change: 1 addition & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -1039,6 +1039,7 @@ class Parameters {
std::vector<MMseqsParameter*> result2profile;
std::vector<MMseqsParameter*> result2msa;
std::vector<MMseqsParameter*> result2dnamsa;
std::vector<MMseqsParameter*> filtera3m;
std::vector<MMseqsParameter*> filterresult;
std::vector<MMseqsParameter*> convertmsa;
std::vector<MMseqsParameter*> msa2profile;
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ set(util_source_files
util/extractorfs.cpp
util/orftocontig.cpp
util/touchdb.cpp
util/filtera3m.cpp
util/filterdb.cpp
util/gff2db.cpp
util/renamedbkeys.cpp
Expand Down
104 changes: 104 additions & 0 deletions src/util/filtera3m.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#include "Debug.h"
#include "Util.h"
#include "FileUtil.h"
#include "KSeqWrapper.h"
#include "MsaFilter.h"
#include "MultipleAlignment.h"

MultipleAlignment::MSAResult readMSA(const std::string &a3mPath) {
KSeqWrapper* kseq = KSeqFactory(a3mPath.c_str());
MultipleAlignment::MSAResult msa(0, 0, 0, NULL);
msa.msaSequenceLength = 0;
msa.centerLength = 0;
msa.setSize = 0;
std::vector<std::string> sequences;
std::string seq;
Debug::Progress progress;
while (kseq->ReadEntry()) {
progress.updateProgress();
const KSeqWrapper::KSeqEntry &e = kseq->entry;
seq.reserve(e.sequence.l);
if (msa.setSize == 0) {
msa.msaSequenceLength = e.sequence.l;
msa.centerLength = e.sequence.l;
}

for (size_t i = 0; i < e.sequence.l; i++) {
char c = e.sequence.s[i];
if (!(c >= 'a' && c <= 'z')) {
seq.push_back(c);
}
}
sequences.push_back(seq);
seq.clear();
msa.setSize++;
}
msa.msaSequence = new char*[msa.setSize];
for (size_t i = 0; i < msa.setSize; i++) {
msa.msaSequence[i] = &(sequences[i])[0];
}
delete kseq;
return msa;
}

int filtera3m(int argc, const char **argv, const Command& command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

std::vector<std::string> qid_str_vec = Util::split(par.qid, ",");
std::vector<int> qid_vec;
for (size_t qid_idx = 0; qid_idx < qid_str_vec.size(); qid_idx++) {
float qid_float = strtod(qid_str_vec[qid_idx].c_str(), NULL);
qid_vec.push_back(static_cast<int>(qid_float*100));
}
std::sort(qid_vec.begin(), qid_vec.end());

MultipleAlignment::MSAResult msa = readMSA(par.db1.c_str());
FILE* handle = FileUtil::openFileOrDie(par.db2.c_str(), "w", false);

SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0f, -0.2f);
MsaFilter filter(msa.centerLength + 1, msa.setSize, &subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid());
filter.filter(
msa.setSize,
msa.centerLength,
static_cast<int>(par.covMSAThr * 100),
qid_vec,
par.qsc,
static_cast<int>(par.filterMaxSeqId * 100),
par.Ndiff,
par.filterMinEnable,
(const char **) msa.msaSequence,
false
);
bool* kept = new bool[msa.setSize];
filter.getKept(kept, msa.setSize);

KSeqWrapper* kseq2 = KSeqFactory(par.db1.c_str());
size_t i = 0;
while (kseq2->ReadEntry()) {
if (kept[i] == false) {
i++;
continue;
}
fwrite(">", sizeof(char), 1, handle);
fwrite(kseq2->entry.name.s, sizeof(char), kseq2->entry.name.l, handle);
if (kseq2->entry.comment.l > 0) {
fwrite(" ", sizeof(char), 1, handle);
fwrite(kseq2->entry.comment.s, sizeof(char), kseq2->entry.comment.l, handle);
}
fwrite("\n", sizeof(char), 1, handle);
fwrite(kseq2->entry.sequence.s, sizeof(char), kseq2->entry.sequence.l, handle);
fwrite("\n", sizeof(char), 1, handle);
i++;
}
if (fclose(handle) != 0) {
Debug(Debug::ERROR) << "Cannot close file " << par.db2 << "\n";
return EXIT_FAILURE;
}

delete kseq2;
delete[] kept;
delete[] msa.msaSequence;

return EXIT_SUCCESS;
}

0 comments on commit 167bbd1

Please sign in to comment.