Skip to content

Commit

Permalink
Adding FIRST draft of changes for multiple alignment enhancement - ha…
Browse files Browse the repository at this point in the history
…s not been exhaustively tested yet
  • Loading branch information
nandsra21 committed Apr 9, 2024
1 parent b36a39c commit 05b2d03
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 31 deletions.
4 changes: 2 additions & 2 deletions src/pathogen_embed/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def autoOrFloat(values):
def make_parser_embed():
parser = argparse.ArgumentParser(description = "Reduced dimension embeddings for pathogen sequences", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("--alignment", nargs="+", required = True, help="an aligned FASTA file (or files) to create a distance matrix with. Make sure the strain order in this file matches the order in the distance matrix. If adding more than one alignment, make sure the order of the strains and strain names are the same between all the files.")
parser.add_argument("--distance-matrix", nargs="+", help="a distance matrix (or matrices) that can be read in by pandas, index column as row 0. If adding more than one distance matrix, make sure the order of the strains and strain names in the header are the same between all the files.")
parser.add_argument("--alignment", nargs='+', required = True, help="an aligned FASTA file (or files) to create a distance matrix with. Make sure the strain order in this file matches the order in the distance matrix. If adding more than one alignment, make sure the order of the strains and strain names are the same between all the files.")
parser.add_argument("--distance-matrix", nargs='+', help="a distance matrix (or matrices) that can be read in by pandas, index column as row 0. If adding more than one distance matrix, make sure the order of the strains and strain names in the header are the same between all the files.")
parser.add_argument("--separator", default=",", help="separator between columns in the given distance matrix")
parser.add_argument("--indel-distance", action="store_true", help="include insertions/deletions in genetic distance calculations")
parser.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
Expand Down
92 changes: 63 additions & 29 deletions src/pathogen_embed/pathogen_embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def get_hamming_distances(genomes, count_indels=False):
>>> genomes = ["ACTGG", "A--GN", "A-NGG"]
>>> get_hamming_distances(genomes, True)
[1, 1, 1]
f

This comment has been minimized.

Copy link
@huddlej

huddlej Apr 9, 2024

Contributor

Typo here

When counting indels, we ignore leading and trailing gaps that indicate
different sequence lengths and not biological events.
Expand Down Expand Up @@ -188,18 +188,18 @@ def concat_distance_matrices(matrices):
# TODO: add error checking?
result = matrices[0]

result_df = pd.read_csv(result, header=None)
result_df = pd.read_csv(result, index_col=0)

result_arr = result_df.values.astype(float)

# Add the numpy arrays element-wise
for matrix in matrices[1:]:
other_df = pd.read_csv(matrix, header=None)
other_df = pd.read_csv(matrix, index_col=0)
other_arr = other_df.values.astype(float)

result_arr = result_arr + other_arr

return result_arr
return pd.DataFrame(result_arr)

def embed(args):
# Setting Random seed for numpy
Expand All @@ -214,44 +214,77 @@ def embed(args):
sys.exit(1)

if args.alignment is not None and args.distance_matrix is not None and len(args.alignment) != len(args.distance_matrix):
print(len(args.alignment))
print(len(args.distance_matrix))
print("If giving multiple alignments and distance matrices the number of both must match", file=sys.stderr)
sys.exit(1)

set_index = False
# getting or creating the distance matrix
distance_matrix = None
if args.distance_matrix is not None and args.command != "pca":
distance_matrix = concat_distance_matrices(args.distance_matrix)
set_index = True

# if alignments and no distance matrices
sequence_names = None
if args.alignment is not None and distance_matrix is None:
# if args.alignment is not None and distance_matrix is None:

for alignment in args.alignment:
curr_matrix = None
# TODO: check the concatenation logic
if (distance_matrix is not None):
curr_matrix = distance_matrix
# for alignment in args.alignment:
# curr_matrix = None
# if (distance_matrix is not None):
# curr_matrix = distance_matrix

# sequences_by_name = OrderedDict()

# for sequence in Bio.SeqIO.parse(alignment, "fasta"):
# sequences_by_name[sequence.id] = str(sequence.seq)

# if (sequence_names is not None):
# if (sequence_names != list(sequences_by_name.keys)):
# print("The strains in the multiple alignments must match for concatenating them", file=sys.stderr)
# sys.exit(1)

# sequence_names = list(sequences_by_name.keys())
# if args.command != "pca" and distance_matrix is None:
# # Calculate Distance Matrix
# hamming_distances = get_hamming_distances(
# list(sequences_by_name.values()),
# args.indel_distance,
# )
# distance_matrix = pd.DataFrame(squareform(hamming_distances))
# distance_matrix.index = sequence_names
# distance_matrix = curr_matrix + distance_matrix


if args.alignment is not None and distance_matrix is None:
for alignment in args.alignment:
sequences_by_name = OrderedDict()

for sequence in Bio.SeqIO.parse(alignment, "fasta"):
sequences_by_name[sequence.id] = str(sequence.seq)

if (sequence_names is not None):
if (sequence_names != list(sequences_by_name.keys)):
print("The strains in the multiple alignments must match for concatenating them", file=sys.stderr)
sys.exit(1)
seq_sorted = sorted(list(sequences_by_name.keys()))
if sequence_names is not None and sorted(sequence_names) != seq_sorted:
print("The strains in the multiple alignments must match before concatenating them", file=sys.stderr)
sys.exit(1)

sequence_names = list(sequences_by_name.keys())
if args.command != "pca" and distance_matrix is None:
# Calculate Distance Matrix
hamming_distances = get_hamming_distances(
list(sequences_by_name.values()),
args.indel_distance,
)
distance_matrix = pd.DataFrame(squareform(hamming_distances))
distance_matrix.index = sequence_names
distance_matrix = curr_matrix + distance_matrix
sequence_names = seq_sorted #list(sequences_by_name.keys())

# assert sequence_names == list(sequences_by_name.keys())

# Calculate Distance Matrix
hamming_distances = get_hamming_distances(list(sequences_by_name.values()), args.indel_distance)
new_distance_matrix = pd.DataFrame(squareform(hamming_distances))
new_distance_matrix.index = sequence_names

if distance_matrix is None:
distance_matrix = new_distance_matrix
else:
distance_matrix += new_distance_matrix

if (set_index):
distance_matrix.index = sequence_names

# Load embedding parameters from an external CSV file, if possible.
external_embedding_parameters = None
Expand Down Expand Up @@ -279,12 +312,13 @@ def embed(args):
for sequence in Bio.SeqIO.parse(alignment, "fasta"):
sequences_by_name[sequence.id] = str(sequence.seq)

seq_sorted = sorted(list(sequences_by_name.keys()))
if (sequence_names is not None):
if (sequence_names != list(sequences_by_name.keys)):
if (sequence_names != seq_sorted):
print("The strains in the multiple alignments must match for concatenating them", file=sys.stderr)
sys.exit(1)

sequence_names = list(sequences_by_name.keys())
sequence_names = seq_sorted

numbers = list(sequences_by_name.values())[:]
for i in range(0,len(list(sequences_by_name.values()))):
Expand All @@ -297,9 +331,8 @@ def embed(args):
genomes_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
else:
second_df = pd.DataFrame(numbers)
second_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
genomes_df = genomes_df.add(second_df)

second_df.columns = ["Site " + str(k) for k in range(genomes_df.shape[1],genomes_df.shape[1] + len(numbers[i]))]
genomes_df = pd.concat([genomes_df, second_df], axis=1)
#performing PCA on my pandas dataframe
pca = PCA(
n_components=n_components,
Expand Down Expand Up @@ -357,6 +390,7 @@ def embed(args):
embedding_parameters[key] = value_type(value)

if args.command != "pca":
#TODO: distance matrices are no longer symmetrics/not square? Check into this
embedder = embedding_class(**embedding_parameters)
embedding = embedder.fit_transform(distance_matrix)

Expand Down
1 change: 1 addition & 0 deletions tests/pathogen-embed-multiple-alignment-mds.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ There should be 1 entry for the stress of the MDS embedding.

$ wc -l < embed_mds_stress.csv
\s*1 (re)

0 comments on commit 05b2d03

Please sign in to comment.