Skip to content

Commit

Permalink
Merge pull request #30 from IEDB/unity
Browse files Browse the repository at this point in the history
Unify Python and C++ scripts into a single C++ script
  • Loading branch information
acrinklaw authored Dec 10, 2021
2 parents 204331a + 02b507d commit 465d081
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 95 deletions.
11 changes: 3 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
- Python 3 (to generate certain output)

## Installation:
Please download TCRMatch from the latest [release](https://github.com/IEDB/TCRMatch/releases/tag/v0.1.1).
If you wish to compile from source, simply clone the repo and run
Please download TCRMatch from the latest [release](https://github.com/IEDB/TCRMatch/releases/) and compile from source by running:
```shell
make
```
Expand All @@ -30,6 +29,8 @@ ASGDRGADTQY
ASSQAGAYEQY
```

Alternatively, you may upload files using the AIRR tsv format specified [here](https://docs.airr-community.org/en/stable/datarep/rearrangements.html)

To generate the initial scores, please run
```shell
./tcrmatch -i input_file -t num_threads [-s threshold] [-d /path/to/database] [-a] > output_file
Expand All @@ -40,12 +41,6 @@ To generate the initial scores, please run
- -s optional parameter to specify the threshold (default is .97, in alignment with manuscript)
- -d optional parameter to specify where the database is located, point this to a file of newline separated CDR3b sequences to test your own private set

To generate the output format which contains information regarding epitopes, receptor groups, antigens and source organisms, navigate to the "scripts" folder and run the process_output.py file.
```shell
./process_output.py -r results_file -o output_file
```
where input_file refers to the file generated by the ./tcrmatch command, and output_file is the file you wish to store the full output to.

To update the IEDB_data.tsv file located in the data folder, navigate to the scripts directory and simply run
```
./update.sh
Expand Down
61 changes: 0 additions & 61 deletions scripts/process_output.py

This file was deleted.

135 changes: 109 additions & 26 deletions src/tcrmatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <math.h>
#include <omp.h>
#include <string>
#include <tuple>
#include <unistd.h>
#include <vector>
#include <algorithm>

std::array<std::array<float, 20>, 20> k1;
int p_kmin = 1;
Expand Down Expand Up @@ -95,9 +97,52 @@ std::vector<std::string> read_IEDB_data(std::string IEDB_data_file) {
iedb_data.push_back(sequence);
}
}

return iedb_data;
}

struct IEDB_data_row {
std::string amino_acid_sequence;
std::string original_sequence;
std::string receptor_group;
std::string epitope;
std::string organism;
std::string antigen;
};

std::map<std::string, std::vector<IEDB_data_row>>
create_IEDB_map(std::string IEDB_data_file) {
std::map<std::string, std::vector<IEDB_data_row>> iedb_map;
std::map<std::string, std::vector<IEDB_data_row>>::iterator it;
std::vector<IEDB_data_row> iedb_data;
std::ifstream iedb_file(IEDB_data_file);
IEDB_data_row input_row;
std::string line;
std::string temp;
int idx;

// read values
while (getline(iedb_file, line)) {
idx = 0;
std::stringstream buffer(line);
std::string values[6];
while (getline(buffer, temp, '\t')) {
values[idx] = temp;
idx++;
}
it = iedb_map.find(values[0]);
if (it != iedb_map.end()) {
iedb_map[values[0]].push_back(
{values[0], values[1], values[2], values[3], values[4], values[5]});
} else {
iedb_map[values[0]] = {
{values[0], values[1], values[2], values[3], values[4], values[5]}};
}
}

return iedb_map;
}

std::vector<std::string> read_AIRR_data(std::string AIRR_data_file) {
std::vector<std::string> airr_data;
std::ifstream airr_file(AIRR_data_file);
Expand All @@ -109,7 +154,7 @@ std::vector<std::string> read_AIRR_data(std::string AIRR_data_file) {
idx = 0;
std::getline(airr_file, line);
std::stringstream buffer(line);
while(getline(buffer, temp, '\t') ) {
while (getline(buffer, temp, '\t')) {
if (temp == "cdr3_aa") {
column = idx;
break;
Expand All @@ -118,15 +163,16 @@ std::vector<std::string> read_AIRR_data(std::string AIRR_data_file) {
}

if (column < 0) {
std::cerr << "Cannot find cdr3_aa column in AIRR TSV input file" << std::endl;
std::cerr << "Cannot find cdr3_aa column in AIRR TSV input file"
<< std::endl;
return airr_data;
}

// read values
while (getline(airr_file, line)) {
std::stringstream buffer(line);
std::vector<std::string> values;
while(getline(buffer, temp, '\t') ) {
while (getline(buffer, temp, '\t')) {
values.push_back(temp);
}
airr_data.push_back(values[column]);
Expand Down Expand Up @@ -200,10 +246,13 @@ float k3_sum(peptide pep1, peptide pep2) {
}

void multi_calc_k3(std::vector<peptide> peplist1, std::vector<peptide> peplist2,
float threshold) {
float threshold,
std::map<std::string, std::vector<IEDB_data_row>> iedb_map) {

// Simple method to calculate pairwise TCRMatch scores using two peptide
// vectors
std::vector<std::tuple<std::string, std::string, float>>
std::vector<std::tuple<std::string, std::string, float, int>>::iterator it2;
std::vector<std::tuple<std::string, std::string, float, int>>
results[omp_get_max_threads()];
#pragma omp parallel for
for (int i = 0; i < peplist1.size(); i++) {
Expand All @@ -214,15 +263,34 @@ void multi_calc_k3(std::vector<peptide> peplist1, std::vector<peptide> peplist2,
score = k3_sum(pep1, pep2) / sqrt(pep1.aff * pep2.aff);
if (score > threshold) {
int tid = omp_get_thread_num();
results[tid].push_back(make_tuple(pep1.seq, pep2.seq, score));
// If input seq-match seq is unique, add tuple to results
// Repeat IEDB matches are skipped (prevents duplicate rows in results) but duplicate input
// sequences are permitted (e.g. for repertoires with identical TCRs)
it2 = find(results[tid].begin(), results[tid].end(), make_tuple(pep1.seq, pep2.seq, score, i));
if (it2 == results[tid].end()) {
results[tid].push_back(make_tuple(pep1.seq, pep2.seq, score, i));
}
}
}
}
for (int i = 0; i < omp_get_max_threads(); i++) {
for (auto &tuple : results[i]) {
std::cout << std::fixed << std::setprecision(2) << std::get<0>(tuple)
<< " " << std::get<1>(tuple) << " " << std::get<2>(tuple)
<< std::endl;
std::cout << "input_sequence\tmatch_"
"sequence\tscore\treceptor_"
"group\tepitope\tantigen\torganism\t"
<< std::endl;
for (int k = 0; k < omp_get_max_threads(); k++) {
for (auto &tuple : results[k]) {
std::string match_peptide = std::get<1>(tuple);
std::map<std::string, std::vector<IEDB_data_row>>::iterator it =
iedb_map.find(match_peptide);
std::vector<IEDB_data_row> match_row_vec = it->second;
for (int l = 0; l < match_row_vec.size(); l++) {
IEDB_data_row match_row = match_row_vec[l];
std::cout << std::fixed << std::setprecision(2) << std::get<0>(tuple)
<< "\t" << std::get<1>(tuple) << "\t" << std::get<2>(tuple)
<< "\t" << match_row.receptor_group << "\t"
<< match_row.epitope << "\t" << match_row.antigen << "\t"
<< match_row.organism << std::endl;
}
}
}
}
Expand Down Expand Up @@ -267,7 +335,7 @@ int main(int argc, char *argv[]) {
return EXIT_FAILURE;
}
}

// Check that required parameters are there + error correcting
if (i_flag == -1 || t_flag == -1) {
std::cerr << "Missing mandatory parameters" << std::endl
Expand All @@ -282,15 +350,27 @@ int main(int argc, char *argv[]) {
if (thresh_flag == -1) {
threshold = .97;
}
if (t_flag == -1){
if (t_flag == -1) {
n_threads = 1;
}
if (threshold < 0 || threshold > 1) {
std::cerr << "Threshold must be between 0 and 1" << std::endl;
return EXIT_FAILURE;
}

std::map<std::string, std::vector<IEDB_data_row>> iedb_map =
create_IEDB_map(iedb_file);

std::vector<std::string> iedb_data = read_IEDB_data(iedb_file);
// Check if we didn't read any data into IEDB data
if (iedb_data.size() == 0) {
std::cout
<< "Failed to read database file - please specify the path to database "
"file if not ./data/IEDB_data.tsv"
<< std::endl;

return EXIT_FAILURE;
}
std::string line;
std::string alphabet;
std::vector<peptide> peplist1;
Expand All @@ -303,16 +383,17 @@ int main(int argc, char *argv[]) {
if (airr_flag == 1) {
// AIRR input format
std::vector<std::string> airr_data = read_AIRR_data(in_file);
if (airr_data.empty()) return EXIT_FAILURE;
if (airr_data.empty())
return EXIT_FAILURE;
for (std::vector<std::string>::iterator it = airr_data.begin();
it != airr_data.end(); it++) {
it != airr_data.end(); it++) {
std::vector<int> int_vec;
for (int i = 0; i < (*it).length(); i++) {
if (alphabet.find((*it)[i]) == -1) {
std::cerr << "Invalid amino acid found in " << *it << " at position "
<< i + 1 << std::endl;
return EXIT_FAILURE;
}
if (alphabet.find((*it)[i]) == -1) {
std::cerr << "Invalid amino acid found in " << *it << " at position "
<< i + 1 << std::endl;
return EXIT_FAILURE;
}
}
peplist1.push_back({*it, int((*it).length()), -99.9, int_vec});
}
Expand All @@ -322,11 +403,11 @@ int main(int argc, char *argv[]) {
while (getline(file1, line)) {
std::vector<int> int_vec;
for (int i = 0; i < line.length(); i++) {
if (alphabet.find(line[i]) == -1) {
std::cerr << "Invalid amino acid found in " << line << " at position "
<< i + 1 << std::endl;
return EXIT_FAILURE;
}
if (alphabet.find(line[i]) == -1) {
std::cerr << "Invalid amino acid found in " << line << " at position "
<< i + 1 << std::endl;
return EXIT_FAILURE;
}
}
peplist1.push_back({line, int(line.length()), -99.9, int_vec});
}
Expand Down Expand Up @@ -366,7 +447,9 @@ int main(int argc, char *argv[]) {
}
pep_ptr->aff = k3_sum(*pep_ptr, *pep_ptr);
}
multi_calc_k3(peplist1, peplist2, threshold);

// Calculate input data vs database with multi-threading
multi_calc_k3(peplist1, peplist2, threshold, iedb_map);

return 0;
}

0 comments on commit 465d081

Please sign in to comment.