Enable clustering by distance matrix input #33

Merged · 4 commits · Jul 29, 2024
8 changes: 8 additions & 0 deletions CHANGES.md
@@ -1,5 +1,13 @@
# CHANGELOG

## 2.3.0

### Features

* pathogen-cluster: Add `--distance-matrix` input argument to support HDBSCAN clustering of genetic distances from `pathogen-distance` ([#33][])

[#33]: https://github.com/blab/pathogen-embed/pull/33

## 2.2.1

### Bug Fixes
3 changes: 0 additions & 3 deletions src/pathogen_embed/__init__.py
@@ -3,8 +3,5 @@

Reduced dimension embeddings for pathogen sequences.
"""

from .__main__ import make_parser_cluster, make_parser_distance, make_parser_embed

__author__ = 'Sravani Nanduri, John Huddleston'
__credits__ = 'Bedford Lab, Vaccine and Infectious Disease Division, Fred Hutchinson Cancer Center, Seattle, WA, USA'
39 changes: 30 additions & 9 deletions src/pathogen_embed/__main__.py
@@ -1,7 +1,6 @@
import argparse
import sys
from sys import argv
from .pathogen_embed import embed, distance, cluster

def autoOrFloat(values):
if values == "auto":
@@ -76,25 +75,47 @@ def make_parser_distance():
def make_parser_cluster():
parser = argparse.ArgumentParser(description = "HDBSCAN clustering for reduced dimension embeddings", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("--embedding", required = True, help="The embedding to assign clustering labels to via HDBSCAN (https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html)")
parser.add_argument("--label-attribute", help="the name of the cluster used to label the column in the resulting dataframe")
parser.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
parser.add_argument("--min-size", type=int, default=5, help="minimum cluster size for HDBSCAN")
parser.add_argument("--min-samples", type=int, default=5, help="minimum number of sample to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
parser.add_argument("--distance-threshold", type=float, help="The float value for the distance threshold by which to cluster data in the embedding and assign labels via HDBSCAN. If no value is given in distance-threshold, the default distance threshold of 0.0 will be used.")
parser.add_argument("--output-dataframe", required = True, help="a csv file outputting the embedding with the strain name and its components.")
parser.add_argument("--output-figure", help="outputs a PDF with a plot of the embedding colored by cluster")
input_group = parser.add_argument_group(
"Input data",
"Choose an input data format from either an embedding or a distance matrix.",
)
exclusive_input_group = input_group.add_mutually_exclusive_group(required=True)
exclusive_input_group.add_argument("--embedding", help="an embedding to assign cluster labels to using Euclidean distance between input records")
exclusive_input_group.add_argument("--distance-matrix", help="a distance matrix to assign cluster labels to using the given precomputed values as the distance between input records")

options_group = parser.add_argument_group(
"Options",
"Options to control clustering",
)
options_group.add_argument("--label-attribute", help="the name of the cluster used to label the column in the resulting dataframe")
options_group.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
options_group.add_argument("--min-size", type=int, default=5, help="minimum cluster size for HDBSCAN")
options_group.add_argument("--min-samples", type=int, default=5, help="minimum number of sample to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
options_group.add_argument("--distance-threshold", type=float, help="The float value for the distance threshold by which to cluster data in the embedding and assign labels via HDBSCAN. If no value is given in distance-threshold, the default distance threshold of 0.0 will be used.")

output_group = parser.add_argument_group(
"Output",
"Output files produced by clustering"
)
output_group.add_argument("--output-dataframe", required = True, help="a csv file outputting the embedding with the strain name and its components.")
output_group.add_argument("--output-figure", help="outputs a PDF with a plot of the embedding colored by cluster")

return parser

def run_embed():
args = make_parser_embed().parse_args(argv[1:])

from .pathogen_embed import embed
return embed(args)

def run_distance():
args = make_parser_distance().parse_args(argv[1:])

from .pathogen_embed import distance
return distance(args)

def run_cluster():
args = make_parser_cluster().parse_args(argv[1:])

from .pathogen_embed import cluster
return cluster(args)
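
The new input group above makes `--embedding` and `--distance-matrix` mutually exclusive, so `pathogen-cluster` accepts exactly one of them. A minimal stand-alone sketch of that argparse pattern (the parser and file names here are hypothetical, not from the repository):

```python
import argparse

# Sketch of the input-group pattern used in make_parser_cluster above.
parser = argparse.ArgumentParser(description="HDBSCAN clustering sketch")
input_group = parser.add_argument_group("Input data")
exclusive_input = input_group.add_mutually_exclusive_group(required=True)
exclusive_input.add_argument("--embedding", help="embedding CSV (Euclidean distances)")
exclusive_input.add_argument("--distance-matrix", help="precomputed distance matrix CSV")

# Exactly one input is allowed:
args = parser.parse_args(["--distance-matrix", "distances.csv"])  # ok
# parser.parse_args([])                              # error: one of the arguments is required
# parser.parse_args(["--embedding", "a.csv",
#                    "--distance-matrix", "b.csv"])  # error: not allowed together
```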
66 changes: 41 additions & 25 deletions src/pathogen_embed/pathogen_embed.py
@@ -1,28 +1,15 @@
# Ignore warnings from Numba deprecation:
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
# Numba is required by UMAP.
from numba.core.errors import NumbaDeprecationWarning
import warnings

warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
import sys

import matplotlib; matplotlib.set_loglevel("critical")
import argparse
import Bio.AlignIO
import Bio.SeqIO
from collections import OrderedDict
import hdbscan
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import squareform, pdist
from scipy.stats import linregress
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
import sys
from umap import UMAP


def distance_tick_formatter(tick_val, tick_pos):
return str(int(tick_val))
@@ -76,6 +63,8 @@ def encode_alignment_for_pca_by_integer(alignment_path):
sequence.

"""
import re

sequences_by_name = OrderedDict()

for sequence in Bio.SeqIO.parse(alignment_path, "fasta"):
@@ -205,6 +194,8 @@ def encode_alignment_for_pca_by_biallelic(alignment_path):
sequence.

"""
import Bio.AlignIO

valid_nucleotides = {"A", "C", "G", "T"}
alignment = Bio.AlignIO.read(alignment_path, "fasta")
reference_sequence = str(alignment[0].seq)
@@ -356,6 +347,19 @@ def get_hamming_distances(genomes, count_indels=False):
return hamming_distances

def embed(args):
# Ignore warnings from Numba deprecation:
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
# Numba is required by UMAP.
from numba.core.errors import NumbaDeprecationWarning
import warnings

warnings.simplefilter('ignore', category=NumbaDeprecationWarning)

from scipy.stats import linregress
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from umap import UMAP

# Setting Random seed for numpy
np.random.seed(seed=args.random_seed)

@@ -673,29 +677,41 @@ def embed(args):
plt.savefig(args.output_pairwise_distance_figure)

def cluster(args):
import hdbscan

if not args.embedding.endswith('.csv'):
print("You must supply a CSV file for the embedding.", file=sys.stderr)
sys.exit(1)
else:
embedding_df = pd.read_csv(args.embedding, index_col=0)
if args.embedding:
if args.embedding.endswith(".csv"):
data_df = pd.read_csv(args.embedding, index_col=0)
metric = "euclidean"
else:
print("You must supply a CSV file for the embedding.", file=sys.stderr)
sys.exit(1)
elif args.distance_matrix:
if args.distance_matrix.endswith(".csv"):
data_df = pd.read_csv(args.distance_matrix, index_col=0)
metric = "precomputed"
else:
print("You must supply a CSV file for the distance matrix.", file=sys.stderr)
sys.exit(1)

clustering_parameters = {
"metric": metric,
**({"min_cluster_size": args.min_size} if args.min_size is not None else {}),
**({"min_samples": args.min_samples} if args.min_samples is not None else {}),
**({"cluster_selection_epsilon": args.distance_threshold} if args.distance_threshold is not None else {})
}

clusterer = hdbscan.HDBSCAN(**clustering_parameters)

clusterer.fit(embedding_df)
embedding_df[args.label_attribute] = clusterer.labels_.astype(str)
data_df = data_df.astype(float)
clusterer.fit(data_df)
data_df[args.label_attribute] = clusterer.labels_.astype(str)

if args.output_figure is not None:

plot_data = {
"x": embedding_df.to_numpy()[:, 0],
"y": embedding_df.to_numpy()[:, 1],
"x": data_df.to_numpy()[:, 0],
"y": data_df.to_numpy()[:, 1],
}

plot_data["cluster"] = clusterer.labels_.astype(str)
@@ -722,7 +738,7 @@ def cluster(args):
plt.close()

if args.output_dataframe is not None:
embedding_df.to_csv(args.output_dataframe, index_label="strain")
data_df.to_csv(args.output_dataframe, index_label="strain")

def calculate_distances_from_alignment(alignment_path, indel_distance):
sequences_by_name = OrderedDict()
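
The rewritten `cluster()` above selects the HDBSCAN metric from the input type: `"euclidean"` for an embedding and `"precomputed"` for a distance matrix. A minimal sketch of that distinction on a small, hypothetical distance matrix (data and strain names invented for illustration):

```python
import hdbscan
import pandas as pd

# Hypothetical 4x4 symmetric distance matrix (e.g., pairwise genetic distances).
strains = ["A", "B", "C", "D"]
distances = pd.DataFrame(
    [[0, 1, 9, 10],
     [1, 0, 8, 9],
     [9, 8, 0, 2],
     [10, 9, 2, 0]],
    index=strains,
    columns=strains,
).astype(float)

# "precomputed" tells HDBSCAN the input already contains pairwise distances,
# so it skips its own distance calculation.
clusterer = hdbscan.HDBSCAN(metric="precomputed", min_cluster_size=2, min_samples=1)
clusterer.fit(distances)

labels = pd.Series(clusterer.labels_.astype(str), index=strains, name="genetic_label")
print(labels)  # A/B and C/D should fall into separate clusters
```

With `metric="precomputed"`, HDBSCAN never recomputes pairwise distances, which is what lets `pathogen-cluster` consume the output of `pathogen-distance` directly.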
22 changes: 22 additions & 0 deletions tests/pathogen-cluster-by-distances.t
@@ -0,0 +1,22 @@
Get a distance matrix from a H3N2 HA alignment.

$ pathogen-distance \
> --alignment $TESTDIR/data/h3n2_ha_alignment.sorted.fasta \
> --output ha_distances.csv

Find clusters from the genetic distances.

$ pathogen-cluster \
> --distance-matrix ha_distances.csv \
> --label-attribute genetic_label \
> --distance-threshold 0.5 \
> --output-dataframe cluster_distances.csv

There should be one record in the cluster output for each record in the input distances.

$ [[ $(wc -l < cluster_distances.csv) == $(wc -l < ha_distances.csv) ]]

The header should include the index strain, each strain from the distance matrix, and the requested cluster label.

$ head -n 1 cluster_distances.csv
strain,.*,genetic_label (re)
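
One quick way to inspect the clustered output after running the commands above (a sketch, assuming `cluster_distances.csv` exists in the working directory):

```python
import pandas as pd

# Read the clustered output produced by the cram test above.
clusters = pd.read_csv("cluster_distances.csv", index_col="strain")

# The requested label column holds the HDBSCAN cluster ids; "-1" marks unclustered records.
print(clusters["genetic_label"].value_counts())
```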