Skip to content

Commit

Permalink
Adding new feature that allows for multiple alignments + tests
Browse files Browse the repository at this point in the history
  • Loading branch information
nandsra21 committed Mar 29, 2024
1 parent 38c859d commit 5805c58
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 31 deletions.
4 changes: 2 additions & 2 deletions src/pathogen_embed/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def autoOrFloat(values):
def make_parser_embed():
parser = argparse.ArgumentParser(description = "Reduced dimension embeddings for pathogen sequences", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("--alignment", required = True, help="an aligned FASTA file to create a distance matrix with. Make sure the strain order in this file matches the order in the distance matrix.")
parser.add_argument("--distance-matrix", help="a distance matrix that can be read in by pandas, index column as row 0")
parser.add_argument("--alignment", nargs="+", required = True, help="an aligned FASTA file (or files) to create a distance matrix with. Make sure the strain order in this file matches the order in the distance matrix. If adding more than one alignment, make sure the order of the strains and strain names are the same between all the files.")
parser.add_argument("--distance-matrix", nargs="+", help="a distance matrix (or matrices) that can be read in by pandas, index column as row 0. If adding more than one distance matrix, make sure the order of the strains and strain names in the header are the same between all the files.")
parser.add_argument("--separator", default=",", help="separator between columns in the given distance matrix")
parser.add_argument("--indel-distance", action="store_true", help="include insertions/deletions in genetic distance calculations")
parser.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
Expand Down
111 changes: 82 additions & 29 deletions src/pathogen_embed/pathogen_embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,22 @@ def get_hamming_distances(genomes, count_indels=False):

return hamming_distances

def concat_distance_matrices(matrices):
# TODO: add error checking?
result = matrices[0]

result_df = pd.read_csv(result, header=None)

result_arr = result_df.values.astype(float)

# Add the numpy arrays element-wise
for matrix in matrices[1:]:
other_df = pd.read_csv(matrix, header=None)
other_arr = other_df.values.astype(float)

result_arr = result_arr + other_arr

return result_arr

def embed(args):
# Setting Random seed for numpy
Expand All @@ -197,26 +213,45 @@ def embed(args):
print("You must specify an alignment for pca, not a distance matrix", file=sys.stderr)
sys.exit(1)

if args.alignment is not None and args.distance_matrix is not None and len(args.alignment) != len(args.distance_matrix):

This comment has been minimized.

Copy link
@huddlej

huddlej Apr 9, 2024

Contributor

The length tests at the end of this line are comparing the length of the file names and not the dimensions of the two datasets.

print("If giving multiple alignments and distance matrices the number of both must match", file=sys.stderr)
sys.exit(1)

# getting or creating the distance matrix
distance_matrix = None
if args.distance_matrix is not None:
distance_matrix = pd.read_csv(args.distance_matrix, index_col=0)

if args.alignment is not None:
sequences_by_name = OrderedDict()

for sequence in Bio.SeqIO.parse(args.alignment, "fasta"):
sequences_by_name[sequence.id] = str(sequence.seq)

sequence_names = list(sequences_by_name.keys())
if args.command != "pca" and distance_matrix is None:
# Calculate Distance Matrix
hamming_distances = get_hamming_distances(
list(sequences_by_name.values()),
args.indel_distance,
)
distance_matrix = pd.DataFrame(squareform(hamming_distances))
distance_matrix.index = sequence_names
if args.distance_matrix is not None and args.command != "pca":
distance_matrix = concat_distance_matrices(args.distance_matrix)

# if alignments and no distance matrices
sequence_names = None
if args.alignment is not None and distance_matrix is None:

for alignment in args.alignment:
curr_matrix = None
# TODO: check the concatenation logic
if (distance_matrix is not None):
curr_matrix = distance_matrix

sequences_by_name = OrderedDict()

for sequence in Bio.SeqIO.parse(alignment, "fasta"):
sequences_by_name[sequence.id] = str(sequence.seq)

if (sequence_names is not None):
if (sequence_names != list(sequences_by_name.keys)):
print("The strains in the multiple alignments must match for concatenating them", file=sys.stderr)
sys.exit(1)

sequence_names = list(sequences_by_name.keys())
if args.command != "pca" and distance_matrix is None:
# Calculate Distance Matrix
hamming_distances = get_hamming_distances(
list(sequences_by_name.values()),
args.indel_distance,
)
distance_matrix = pd.DataFrame(squareform(hamming_distances))
distance_matrix.index = sequence_names
distance_matrix = curr_matrix + distance_matrix

# Load embedding parameters from an external CSV file, if possible.
external_embedding_parameters = None
Expand All @@ -235,17 +270,35 @@ def embed(args):

# Use PCA as its own embedding or as an initialization for t-SNE.
if args.command == "pca" or args.command == "t-sne":
sequence_names = list(sequences_by_name.keys())

numbers = list(sequences_by_name.values())[:]
for i in range(0,len(list(sequences_by_name.values()))):
numbers[i] = re.sub(r'[^AGCT]', '5', numbers[i])
numbers[i] = list(numbers[i].replace('A','1').replace('G','2').replace('C', '3').replace('T','4'))
numbers[i] = [int(j) for j in numbers[i]]

genomes_df = pd.DataFrame(numbers)
genomes_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]

genomes_df = None
sequence_names = None
for alignment in args.alignment:

sequences_by_name = OrderedDict()

for sequence in Bio.SeqIO.parse(alignment, "fasta"):
sequences_by_name[sequence.id] = str(sequence.seq)

if (sequence_names is not None):
if (sequence_names != list(sequences_by_name.keys)):
print("The strains in the multiple alignments must match for concatenating them", file=sys.stderr)
sys.exit(1)

sequence_names = list(sequences_by_name.keys())

numbers = list(sequences_by_name.values())[:]
for i in range(0,len(list(sequences_by_name.values()))):
numbers[i] = re.sub(r'[^AGCT]', '5', numbers[i])
numbers[i] = list(numbers[i].replace('A','1').replace('G','2').replace('C', '3').replace('T','4'))
numbers[i] = [int(j) for j in numbers[i]]

if genomes_df is None:
genomes_df = pd.DataFrame(numbers)
genomes_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
else:
second_df = pd.DataFrame(numbers)
second_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
genomes_df = genomes_df.add(second_df)

#performing PCA on my pandas dataframe
pca = PCA(
Expand Down
17 changes: 17 additions & 0 deletions tests/pathogen-embed-multiple-alignment-mds.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Run pathogen-embed with MDS on a H3N2 HA and H3N2 NA alignments.

$ pathogen-embed \
> --alignment $TESTDIR/data/h3n2_ha_alignment.fasta $TESTDIR/data/h3n2_na_alignment.fasta \
> --output-dataframe embed_mds.csv \
> mds \
> --components 2 \
> --stress embed_mds_stress.csv

There should be one record in the embedding per input sequence in the alignment.

$ [[ $(sed 1d embed_mds.csv | wc -l) == $(grep "^>" $TESTDIR/data/h3n2_ha_alignment.fasta | wc -l) ]]

There should be 1 entry for the stress of the MDS embedding.

$ wc -l < embed_mds_stress.csv
\s*1 (re)
17 changes: 17 additions & 0 deletions tests/pathogen-embed-multiple-alignment-pca.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Run pathogen-embed with PCA on a H3N2 HA and H3N2 NA alignments.

$ pathogen-embed \
> --alignment $TESTDIR/data/h3n2_ha_alignment.fasta $TESTDIR/data/h3n2_na_alignment.fasta \
> --output-dataframe embed_pca.csv \
> pca \
> --components 2 \
> --explained-variance embed_pca_variance.csv

There should be one record in the embedding per input sequence in the alignment.

$ [[ $(sed 1d embed_pca.csv | wc -l) == $(grep "^>" $TESTDIR/data/h3n2_ha_alignment.fasta | wc -l) ]]

There should be 2 components of variance explained.

$ sed 1d embed_pca_variance.csv | wc -l
\s*2 (re)

0 comments on commit 5805c58

Please sign in to comment.