Skip to content

Commit

Permalink
Merge pull request #269 from JacksonBurns/fix/sklearn_extra
Browse files Browse the repository at this point in the history
Drop k-medoids Clustering
  • Loading branch information
JacksonBurns authored Apr 15, 2024
2 parents cc38d0a + 88b080f commit 3526835
Show file tree
Hide file tree
Showing 8 changed files with 11 additions and 49 deletions.
4 changes: 2 additions & 2 deletions AIMSim-demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@
"##### Cluster\n",
"Use a clustering algorithm to make groups from the database of molecules.\n",
" - `n_clusters`: The number of clusters to group the molecules into.\n",
" - `clustering_method`: Optional string specifying a clustering method implemented in `sklearn`, one of `kmedoids`, `ward`, or `complete_linkage`. `complete_linkage` will be chosen by default if no alternative is provided.\n",
" - `clustering_method`: Optional string specifying a clustering method implemented in `sklearn`, one of `ward`, or `complete_linkage`. `complete_linkage` will be chosen by default if no alternative is provided.\n",
" - `log_file_path`: String specifying a file to write output to for the execution of this task. Useful for debugging.\n",
" - `cluster_file_path`: String specifying a file path where _AIMSim_ will output the result of clustering. Useful for comparing multiple clustering approaches or saving the results of large data sets.\n",
" - `cluster_plot_settings`: Control the appearance of the clustering plot.\n",
Expand Down Expand Up @@ -666,7 +666,7 @@
"\n",
"clustering = ClusterData(\n",
" n_clusters=5, # data is clustered into 5 clusters\n",
" clustering_method=\"kmedoids\",\n",
" clustering_method=\"ward\",\n",
" embedding_plot_settings={\"embedding\": {\"method\": \"pca\"}},\n",
")\n",
"clustering(molecule_set)"
Expand Down
2 changes: 1 addition & 1 deletion aimsim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
except ImportError:
pass # aimsim_core does not include this

__version__ = "2.1.3"
__version__ = "2.2.0"
10 changes: 2 additions & 8 deletions aimsim/chemical_datastructures/molecule_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ class MoleculeSet:
in the dataset.
cluster(n_clusters=8, clustering_method=None, **kwargs): Cluster
the molecules of the MoleculeSet. Implemented methods:
'kmedoids': for the K-Medoids algorithm.
'complete_linkage', 'complete':
Complete linkage agglomerative hierarchical
clustering.
Expand Down Expand Up @@ -997,10 +996,6 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
clustering_method (str): Clustering algorithm to use. Default is
None in which case the algorithm is chosen from the
similarity measure in use. Implemented clustering_methods are:
'kmedoids': for the K-Medoids algorithm [1].
This method is useful
when the molecular descriptors are continuous / Euclidean
since it relies on the existence of a sensible medoid.
'complete_linkage', 'complete':
Complete linkage agglomerative hierarchical clustering [2].
'average_linkage', 'average':
Expand All @@ -1013,7 +1008,6 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
kwargs (keyword args): Key word arguments to supply to clustering
algorithm. See the documentation pages
listed below for these arguments:
'kmedoids': https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html
'complete_linkage', 'average_linkage', 'single_linkage', 'ward'
: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
Expand All @@ -1037,7 +1031,7 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
"meaningful results."
)
if (
clustering_method == "kmedoids" or clustering_method == "ward"
clustering_method == "ward"
) and self.similarity_measure.type_ == "discrete":
print(
f"{clustering_method} cannot be used with "
Expand All @@ -1047,7 +1041,7 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
clustering_method = None
if clustering_method is None:
if self.similarity_measure.type_ == "continuous":
clustering_method = "kmedoids"
clustering_method = "ward"
else:
clustering_method = "complete_linkage"
self.clusters_ = Cluster(
Expand Down
35 changes: 2 additions & 33 deletions aimsim/ops/clustering.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Operation for clustering molecules"""
import sklearn.exceptions
from sklearn.cluster import AgglomerativeClustering
from sklearn_extra.cluster import KMedoids as SklearnExtraKMedoids


class Cluster:
Expand All @@ -10,10 +9,6 @@ class Cluster:
Attributes:
clustering_method (str):
Label for the specific algorithm used.
'kmedoids':
for the K-Medoids algorithm [1]. This method is useful
when the molecular descriptors are continuous / Euclidean
since it relies on the existence of a sensible medoid.
'complete_linkage', 'complete':
Complete linkage agglomerative hierarchical clustering [2].
'average_linkage', 'average':
Expand All @@ -25,7 +20,7 @@ class Cluster:
Euclidean descriptors.
n_clusters (int):
Number of clusters.
model_ (sklearn.cluster.AgglomerativeClustering or sklearn_extra.cluster.KMedoids):
model_ (sklearn.cluster.AgglomerativeClustering):
The clustering estimator.
labels_ (np.ndarray of shape (n_samples,)):
cluster labels of the training set samples.
Expand All @@ -50,11 +45,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
Args:
n_clusters (int): Number of clusters.
clustering_method (str): Label for the specific algorithm used.
Supported methods are:
'kmedoids' for the K-Medoids algorithm [1]. This method is
useful when the molecular descriptors are continuous
/ Euclidean since it relies on the existence of a
sensible medoid.
'complete_linkage', 'complete' for complete linkage
agglomerative hierarchical clustering [2].
'average_linkage', 'average' for average linkage agglomerative
Expand All @@ -65,7 +55,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
Euclidean descriptors.
kwargs (dict): Keyword arguments. These are passed to the
estimators. Refer to the following documentation page for
kmedoids: https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html
agglomerative hierarchical clustering: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
References:
Expand All @@ -78,9 +67,7 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
"""
self.clustering_method = clustering_method
self.n_clusters = n_clusters
if self.clustering_method == "kmedoids":
self.model_ = self._get_kmedoids_model_(**kwargs)
elif clustering_method in ["complete_linkage", "complete"]:
if clustering_method in ["complete_linkage", "complete"]:
self.model_ = self._get_linkage_model(linkage_method="complete",
**kwargs)
elif clustering_method in ["average", "average_linkage"]:
Expand All @@ -95,24 +82,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
else:
raise ValueError(f"{clustering_method} not implemented")

def _get_kmedoids_model_(self, **kwargs):
"""
Initialize a k-medoids model.
Args:
kwargs (dict): Keyword arguments. These are passed to the
estimators. Refer to the following documentation page for
kmedoids:
[https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html]
"""
_ = kwargs.pop('metric', None)
return SklearnExtraKMedoids(
n_clusters=self.n_clusters,
metric="precomputed",
**kwargs
)

def _get_linkage_model(self, linkage_method, **kwargs):
_ = kwargs.pop('affinity', None)
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ tasks:
plot_color: '#FD6F96'
cluster:
n_clusters: 2
clustering_method: kmedoids
clustering_method: ward
log_file_path: '/Users/himaghnabhattacharjee/Documents/Research/AIMSim_project/AIMSim/examples/Wang-et-al-log-partition-coefficients/log/cluster_log.txt'
cluster_file_path: 'log/clusters.yml'
cluster_plot_settings:
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ seaborn
tabulate
numpy
multiprocess>=0.70
scikit_learn_extra
pandas
# force pyyaml away from specific versions: https://github.com/yaml/pyyaml/issues/724
pyyaml!=6.0.0,!=5.4.0,!=5.4.1,<7
Expand Down
1 change: 0 additions & 1 deletion requirements_core.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
psutil
scikit_learn
scikit_learn_extra
rdkit
numpy
pandas
Expand Down
5 changes: 3 additions & 2 deletions tests/test_MoleculeSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1261,6 +1261,7 @@ def test_invalid_transform_error(self):
)
remove(csv_fpath)

@unittest.skip(reason="kmedoids was removed, obsoleting this test")
def test_clustering_fingerprints(self):
"""
Test the clustering of molecules featurized by their fingerprints.
Expand Down Expand Up @@ -1290,8 +1291,8 @@ def test_clustering_fingerprints(self):
if molecule_set.similarity_measure.type_ == "continuous":
self.assertEqual(
str(molecule_set.clusters_),
"kmedoids",
f"Expected kmedoids clustering for "
"ward",
f"Expected ward clustering for "
f"similarity: {similarity_measure}",
)
else:
Expand Down

0 comments on commit 3526835

Please sign in to comment.