Adding new feature that allows for multiple alignments + tests

blab · Mar 29, 2024 · 5805c58 · huddlej · Apr 9, 2024 · 5805c58
1 parent 38c859d
commit 5805c58
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 31 deletions.
diff --git a/src/pathogen_embed/__main__.py b/src/pathogen_embed/__main__.py
@@ -17,8 +17,8 @@ def autoOrFloat(values):
 def make_parser_embed():
     parser = argparse.ArgumentParser(description = "Reduced dimension embeddings for pathogen sequences", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    parser.add_argument("--alignment", required = True, help="an aligned FASTA file to create a distance matrix with. Make sure the strain order in this file matches the order in the distance matrix.")
-    parser.add_argument("--distance-matrix", help="a distance matrix that can be read in by pandas, index column as row 0")
+    parser.add_argument("--alignment", nargs="+", required = True, help="an aligned FASTA file (or files) to create a distance matrix with. Make sure the strain order in this file matches the order in the distance matrix. If adding more than one alignment, make sure the order of the strains and strain names are the same between all the files.")
+    parser.add_argument("--distance-matrix", nargs="+", help="a distance matrix (or matrices) that can be read in by pandas, index column as row 0.  If adding more than one distance matrix, make sure the order of the strains and strain names in the header are the same between all the files.")
     parser.add_argument("--separator", default=",", help="separator between columns in the given distance matrix")
     parser.add_argument("--indel-distance", action="store_true", help="include insertions/deletions in genetic distance calculations")
     parser.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")

diff --git a/src/pathogen_embed/pathogen_embed.py b/src/pathogen_embed/pathogen_embed.py
@@ -184,6 +184,22 @@ def get_hamming_distances(genomes, count_indels=False):
 
     return hamming_distances
 
+def concat_distance_matrices(matrices):
+    # TODO: add error checking? 
+    result = matrices[0]
+
+    result_df = pd.read_csv(result, header=None)
+
+    result_arr = result_df.values.astype(float)
+
+    # Add the numpy arrays element-wise
+    for matrix in matrices[1:]:
+        other_df = pd.read_csv(matrix, header=None)
+        other_arr = other_df.values.astype(float)
+
+        result_arr = result_arr + other_arr
+
+    return result_arr
 
 def embed(args):
     # Setting Random seed for numpy
@@ -197,26 +213,45 @@ def embed(args):
         print("You must specify an alignment for pca, not a distance matrix", file=sys.stderr)
         sys.exit(1)
 
+    if args.alignment is not None and args.distance_matrix is not None and len(args.alignment) != len(args.distance_matrix):
+        print("If giving multiple alignments and distance matrices the number of both must match", file=sys.stderr)
+        sys.exit(1)
+
     # getting or creating the distance matrix
     distance_matrix = None
-    if args.distance_matrix is not None:
-        distance_matrix  = pd.read_csv(args.distance_matrix, index_col=0)
-
-    if args.alignment is not None:
-        sequences_by_name = OrderedDict()
-
-        for sequence in Bio.SeqIO.parse(args.alignment, "fasta"):
-            sequences_by_name[sequence.id] = str(sequence.seq)
-
-        sequence_names = list(sequences_by_name.keys())
-        if args.command != "pca" and distance_matrix is None:
-            # Calculate Distance Matrix
-            hamming_distances = get_hamming_distances(
-                list(sequences_by_name.values()),
-                args.indel_distance,
-            )
-            distance_matrix = pd.DataFrame(squareform(hamming_distances))
-            distance_matrix.index = sequence_names
+    if args.distance_matrix is not None and args.command != "pca":
+        distance_matrix = concat_distance_matrices(args.distance_matrix)
+
+    # if alignments and no distance matrices
+    sequence_names = None
+    if args.alignment is not None and distance_matrix is None:
+
+        for alignment in args.alignment:
+            curr_matrix = None
+            # TODO: check the concatenation logic
+            if (distance_matrix is not None):
+                curr_matrix = distance_matrix 
+
+            sequences_by_name = OrderedDict()
+
+            for sequence in Bio.SeqIO.parse(alignment, "fasta"):
+                sequences_by_name[sequence.id] = str(sequence.seq)
+
+            if (sequence_names is not None):
+                if (sequence_names != list(sequences_by_name.keys)):
+                    print("The strains in the multiple alignments must match for concatenating them", file=sys.stderr)
+                    sys.exit(1)
+
+            sequence_names = list(sequences_by_name.keys())
+            if args.command != "pca" and distance_matrix is None:
+                # Calculate Distance Matrix
+                hamming_distances = get_hamming_distances(
+                    list(sequences_by_name.values()),
+                    args.indel_distance,
+                )
+                distance_matrix = pd.DataFrame(squareform(hamming_distances))
+                distance_matrix.index = sequence_names
+                distance_matrix = curr_matrix + distance_matrix
 
     # Load embedding parameters from an external CSV file, if possible.
     external_embedding_parameters = None
@@ -235,17 +270,35 @@ def embed(args):
 
     # Use PCA as its own embedding or as an initialization for t-SNE.
     if args.command == "pca" or args.command == "t-sne":
-        sequence_names = list(sequences_by_name.keys())
-
-        numbers = list(sequences_by_name.values())[:]
-        for i in range(0,len(list(sequences_by_name.values()))):
-            numbers[i] = re.sub(r'[^AGCT]', '5', numbers[i])
-            numbers[i] = list(numbers[i].replace('A','1').replace('G','2').replace('C', '3').replace('T','4'))
-            numbers[i] = [int(j) for j in numbers[i]]
-
-        genomes_df = pd.DataFrame(numbers)
-        genomes_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
-
+        genomes_df = None
+        sequence_names = None
+        for alignment in args.alignment:
+
+            sequences_by_name = OrderedDict()
+
+            for sequence in Bio.SeqIO.parse(alignment, "fasta"):
+                sequences_by_name[sequence.id] = str(sequence.seq)
+
+            if (sequence_names is not None):
+                if (sequence_names != list(sequences_by_name.keys)):
+                    print("The strains in the multiple alignments must match for concatenating them", file=sys.stderr)
+                    sys.exit(1)
+
+            sequence_names = list(sequences_by_name.keys())
+
+            numbers = list(sequences_by_name.values())[:]
+            for i in range(0,len(list(sequences_by_name.values()))):
+                numbers[i] = re.sub(r'[^AGCT]', '5', numbers[i])
+                numbers[i] = list(numbers[i].replace('A','1').replace('G','2').replace('C', '3').replace('T','4'))
+                numbers[i] = [int(j) for j in numbers[i]]
+
+            if genomes_df is None:
+                genomes_df = pd.DataFrame(numbers)
+                genomes_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
+            else:
+                second_df = pd.DataFrame(numbers)
+                second_df.columns = ["Site " + str(k) for k in range(0,len(numbers[i]))]
+                genomes_df = genomes_df.add(second_df)
 
         #performing PCA on my pandas dataframe
         pca = PCA(

diff --git a/tests/pathogen-embed-multiple-alignment-mds.t b/tests/pathogen-embed-multiple-alignment-mds.t
@@ -0,0 +1,17 @@
+Run pathogen-embed with MDS on a H3N2 HA and H3N2 NA alignments.
+
+  $ pathogen-embed \
+  >   --alignment $TESTDIR/data/h3n2_ha_alignment.fasta $TESTDIR/data/h3n2_na_alignment.fasta \
+  >   --output-dataframe embed_mds.csv \
+  >   mds \
+  >   --components 2 \
+  >   --stress embed_mds_stress.csv
+
+There should be one record in the embedding per input sequence in the alignment.
+
+  $ [[ $(sed 1d embed_mds.csv | wc -l) == $(grep "^>" $TESTDIR/data/h3n2_ha_alignment.fasta | wc -l) ]]
+
+There should be 1 entry for the stress of the MDS embedding.
+
+  $ wc -l < embed_mds_stress.csv
+  \s*1 (re)
diff --git a/tests/pathogen-embed-multiple-alignment-pca.t b/tests/pathogen-embed-multiple-alignment-pca.t
@@ -0,0 +1,17 @@
+Run pathogen-embed with PCA on a H3N2 HA and H3N2 NA alignments.
+
+  $ pathogen-embed \
+  >   --alignment $TESTDIR/data/h3n2_ha_alignment.fasta $TESTDIR/data/h3n2_na_alignment.fasta \
+  >   --output-dataframe embed_pca.csv \
+  >   pca \
+  >   --components 2 \
+  >   --explained-variance embed_pca_variance.csv
+
+There should be one record in the embedding per input sequence in the alignment.
+
+  $ [[ $(sed 1d embed_pca.csv | wc -l) == $(grep "^>" $TESTDIR/data/h3n2_ha_alignment.fasta | wc -l) ]]
+
+There should be 2 components of variance explained.
+
+  $ sed 1d embed_pca_variance.csv | wc -l
+  \s*2 (re)