Enable clustering by distance matrix input #33

Merged · 4 commits · Jul 29, 2024
8 changes: 8 additions & 0 deletions CHANGES.md
@@ -1,5 +1,13 @@
# CHANGELOG

## 2.3.0

### Features

* pathogen-cluster: Add `--distance-matrix` input argument to support HDBSCAN clustering of genetic distances from `pathogen-distance` ([#33][])

[#33]: https://github.com/blab/pathogen-embed/pull/33

## 2.2.1

### Bug Fixes
3 changes: 0 additions & 3 deletions src/pathogen_embed/__init__.py
@@ -3,8 +3,5 @@

Reduced dimension embeddings for pathogen sequences.
"""

from .__main__ import make_parser_cluster, make_parser_distance, make_parser_embed

__author__ = 'Sravani Nanduri, John Huddleston'
__credits__ = 'Bedford Lab, Vaccine and Infectious Disease Division, Fred Hutchinson Cancer Center, Seattle, WA, USA'
39 changes: 30 additions & 9 deletions src/pathogen_embed/__main__.py
@@ -1,7 +1,6 @@
import argparse
import sys
from sys import argv
from .pathogen_embed import embed, distance, cluster

def autoOrFloat(values):
if values == "auto":
@@ -76,25 +75,47 @@ def make_parser_distance():
def make_parser_cluster():
parser = argparse.ArgumentParser(description = "HDBSCAN clustering for reduced dimension embeddings", formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("--embedding", required = True, help="The embedding to assign clustering labels to via HDBSCAN (https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html)")
parser.add_argument("--label-attribute", help="the name of the cluster used to label the column in the resulting dataframe")
parser.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
parser.add_argument("--min-size", type=int, default=5, help="minimum cluster size for HDBSCAN")
parser.add_argument("--min-samples", type=int, default=5, help="minimum number of sample to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
parser.add_argument("--distance-threshold", type=float, help="The float value for the distance threshold by which to cluster data in the embedding and assign labels via HDBSCAN. If no value is given in distance-threshold, the default distance threshold of 0.0 will be used.")
parser.add_argument("--output-dataframe", required = True, help="a csv file outputting the embedding with the strain name and its components.")
parser.add_argument("--output-figure", help="outputs a PDF with a plot of the embedding colored by cluster")
input_group = parser.add_argument_group(
"Input data",
"Choose an input data format from either an embedding or a distance matrix.",
)
exclusive_input_group = input_group.add_mutually_exclusive_group(required=True)
exclusive_input_group.add_argument("--embedding", help="an embedding to assign cluster labels to using Euclidean distance between input records")
exclusive_input_group.add_argument("--distance-matrix", help="a distance matrix to assign cluster labels to using the given precomputed values as the distance between input records")

options_group = parser.add_argument_group(
"Options",
"Options to control clustering",
)
options_group.add_argument("--label-attribute", help="the name of the cluster used to label the column in the resulting dataframe")
options_group.add_argument("--random-seed", default = 314159, type=int, help="an integer used for reproducible results.")
options_group.add_argument("--min-size", type=int, default=5, help="minimum cluster size for HDBSCAN")
options_group.add_argument("--min-samples", type=int, default=5, help="minimum number of sample to seed a cluster for HDBSCAN. Lowering this value reduces number of samples that do not get clustered.")
options_group.add_argument("--distance-threshold", type=float, help="The float value for the distance threshold by which to cluster data in the embedding and assign labels via HDBSCAN. If no value is given in distance-threshold, the default distance threshold of 0.0 will be used.")

output_group = parser.add_argument_group(
"Output",
"Output files produced by clustering"
)
output_group.add_argument("--output-dataframe", required = True, help="a csv file outputting the embedding with the strain name and its components.")
output_group.add_argument("--output-figure", help="outputs a PDF with a plot of the embedding colored by cluster")

return parser

def run_embed():
args = make_parser_embed().parse_args(argv[1:])

from .pathogen_embed import embed
return embed(args)

def run_distance():
args = make_parser_distance().parse_args(argv[1:])

from .pathogen_embed import distance
return distance(args)

def run_cluster():
args = make_parser_cluster().parse_args(argv[1:])

from .pathogen_embed import cluster
return cluster(args)
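
The new input group above makes `--embedding` and `--distance-matrix` mutually exclusive, so `pathogen-cluster` accepts exactly one of them. A minimal stand-alone sketch of that argparse pattern (the parser and file names here are hypothetical, not from the repository):

```python
import argparse

# Sketch of the input-group pattern used in make_parser_cluster above.
parser = argparse.ArgumentParser(description="HDBSCAN clustering sketch")
input_group = parser.add_argument_group("Input data")
exclusive_input = input_group.add_mutually_exclusive_group(required=True)
exclusive_input.add_argument("--embedding", help="embedding CSV (Euclidean distances)")
exclusive_input.add_argument("--distance-matrix", help="precomputed distance matrix CSV")

# Exactly one input is allowed:
args = parser.parse_args(["--distance-matrix", "distances.csv"])  # ok
# parser.parse_args([])                              # error: one of the arguments is required
# parser.parse_args(["--embedding", "a.csv",
#                    "--distance-matrix", "b.csv"])  # error: not allowed together
```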
66 changes: 41 additions & 25 deletions src/pathogen_embed/pathogen_embed.py
@@ -1,28 +1,15 @@
# Ignore warnings from Numba deprecation:
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
# Numba is required by UMAP.
from numba.core.errors import NumbaDeprecationWarning
import warnings

warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
import sys

import matplotlib; matplotlib.set_loglevel("critical")
import argparse
import Bio.AlignIO
import Bio.SeqIO
from collections import OrderedDict
import hdbscan
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import numpy as np
import pandas as pd
import re
from scipy.spatial.distance import squareform, pdist
from scipy.stats import linregress
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
import sys
from umap import UMAP


def distance_tick_formatter(tick_val, tick_pos):
return str(int(tick_val))
@@ -76,6 +63,8 @@ def encode_alignment_for_pca_by_integer(alignment_path):
sequence.

"""
import re

sequences_by_name = OrderedDict()

for sequence in Bio.SeqIO.parse(alignment_path, "fasta"):
@@ -205,6 +194,8 @@ def encode_alignment_for_pca_by_biallelic(alignment_path):
sequence.

"""
import Bio.AlignIO

valid_nucleotides = {"A", "C", "G", "T"}
alignment = Bio.AlignIO.read(alignment_path, "fasta")
reference_sequence = str(alignment[0].seq)
@@ -356,6 +347,19 @@ def get_hamming_distances(genomes, count_indels=False):
return hamming_distances

def embed(args):
# Ignore warnings from Numba deprecation:
# https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
# Numba is required by UMAP.
from numba.core.errors import NumbaDeprecationWarning
import warnings

warnings.simplefilter('ignore', category=NumbaDeprecationWarning)

from scipy.stats import linregress
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from umap import UMAP

# Setting Random seed for numpy
np.random.seed(seed=args.random_seed)

@@ -673,29 +677,41 @@ def embed(args):
plt.savefig(args.output_pairwise_distance_figure)

def cluster(args):
import hdbscan

if not args.embedding.endswith('.csv'):
print("You must supply a CSV file for the embedding.", file=sys.stderr)
sys.exit(1)
else:
embedding_df = pd.read_csv(args.embedding, index_col=0)
if args.embedding:
if args.embedding.endswith(".csv"):
data_df = pd.read_csv(args.embedding, index_col=0)
metric = "euclidean"
else:
print("You must supply a CSV file for the embedding.", file=sys.stderr)
sys.exit(1)
elif args.distance_matrix:
if args.distance_matrix.endswith(".csv"):
data_df = pd.read_csv(args.distance_matrix, index_col=0)
metric = "precomputed"
else:
print("You must supply a CSV file for the distance matrix.", file=sys.stderr)
sys.exit(1)

clustering_parameters = {
"metric": metric,
**({"min_cluster_size": args.min_size} if args.min_size is not None else {}),
**({"min_samples": args.min_samples} if args.min_samples is not None else {}),
**({"cluster_selection_epsilon": args.distance_threshold} if args.distance_threshold is not None else {})
}

clusterer = hdbscan.HDBSCAN(**clustering_parameters)

clusterer.fit(embedding_df)
embedding_df[args.label_attribute] = clusterer.labels_.astype(str)
data_df = data_df.astype(float)
clusterer.fit(data_df)
data_df[args.label_attribute] = clusterer.labels_.astype(str)

if args.output_figure is not None:

plot_data = {
"x": embedding_df.to_numpy()[:, 0],
"y": embedding_df.to_numpy()[:, 1],
"x": data_df.to_numpy()[:, 0],
"y": data_df.to_numpy()[:, 1],
}

plot_data["cluster"] = clusterer.labels_.astype(str)
@@ -722,7 +738,7 @@ def cluster(args):
plt.close()

if args.output_dataframe is not None:
embedding_df.to_csv(args.output_dataframe, index_label="strain")
data_df.to_csv(args.output_dataframe, index_label="strain")

def calculate_distances_from_alignment(alignment_path, indel_distance):
sequences_by_name = OrderedDict()
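
The rewritten `cluster()` above selects the HDBSCAN metric from the input type: `"euclidean"` for an embedding and `"precomputed"` for a distance matrix. A minimal sketch of that distinction on a small, hypothetical distance matrix (data and strain names invented for illustration):

```python
import hdbscan
import pandas as pd

# Hypothetical 4x4 symmetric distance matrix (e.g., pairwise genetic distances).
strains = ["A", "B", "C", "D"]
distances = pd.DataFrame(
    [[0, 1, 9, 10],
     [1, 0, 8, 9],
     [9, 8, 0, 2],
     [10, 9, 2, 0]],
    index=strains,
    columns=strains,
).astype(float)

# "precomputed" tells HDBSCAN the input already contains pairwise distances,
# so it skips its own distance calculation.
clusterer = hdbscan.HDBSCAN(metric="precomputed", min_cluster_size=2, min_samples=1)
clusterer.fit(distances)

labels = pd.Series(clusterer.labels_.astype(str), index=strains, name="genetic_label")
print(labels)  # A/B and C/D should fall into separate clusters
```

With `metric="precomputed"`, HDBSCAN never recomputes pairwise distances, which is what lets `pathogen-cluster` consume the output of `pathogen-distance` directly.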
22 changes: 22 additions & 0 deletions tests/pathogen-cluster-by-distances.t
@@ -0,0 +1,22 @@
Get a distance matrix from a H3N2 HA alignment.

$ pathogen-distance \
> --alignment $TESTDIR/data/h3n2_ha_alignment.sorted.fasta \
> --output ha_distances.csv

Find clusters from the genetic distances.

$ pathogen-cluster \
> --distance-matrix ha_distances.csv \
> --label-attribute genetic_label \
> --distance-threshold 0.5 \
> --output-dataframe cluster_distances.csv

There should be one record in the cluster output for each record in the input distances.

$ [[ $(wc -l < cluster_distances.csv) == $(wc -l < ha_distances.csv) ]]

The header should include the index strain, each strain from the distance matrix, and the requested cluster label.

$ head -n 1 cluster_distances.csv
strain,.*,genetic_label (re)
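
One quick way to inspect the clustered output after running the commands above (a sketch, assuming `cluster_distances.csv` exists in the working directory):

```python
import pandas as pd

# Read the clustered output produced by the cram test above.
clusters = pd.read_csv("cluster_distances.csv", index_col="strain")

# The requested label column holds the HDBSCAN cluster ids; "-1" marks unclustered records.
print(clusters["genetic_label"].value_counts())
```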