Merge pull request #269 from JacksonBurns/fix/sklearn_extra

Drop k-medoids Clustering
VlachosGroup · Apr 15, 2024 · 3526835 · 3526835
2 parents cc38d0a + 88b080f
commit 3526835
Show file tree

Hide file tree

Showing 8 changed files with 11 additions and 49 deletions.
diff --git a/AIMSim-demo.ipynb b/AIMSim-demo.ipynb
@@ -310,7 +310,7 @@
     "##### Cluster\n",
     "Use a clustering algorithm to make groups from the database of molecules.\n",
     " - `n_clusters`: The number of clusters to group the molecules into.\n",
-    "     - `clustering_method`: Optional string specifying a clustering method implemented in `sklearn`, one of `kmedoids`, `ward`, or `complete_linkage`. `complete_linkage` will be chosen by default if no alternative is provided.\n",
+    "     - `clustering_method`: Optional string specifying a clustering method implemented in `sklearn`, either `ward` or `complete_linkage`. `complete_linkage` will be chosen by default if no alternative is provided.\n",
     "     - `log_file_path`: String specifying a file to write output to for the execution of this task. Useful for debugging.\n",
     "     - `cluster_file_path`: String specifying a file path where _AIMSim_ will output the result of clustering. Useful for comparing multiple clustering approaches or saving the results of large data sets.\n",
     "     - `cluster_plot_settings`: Control the appearance of the clustering plot.\n",
@@ -666,7 +666,7 @@
     "\n",
     "clustering = ClusterData(\n",
     "    n_clusters=5,  # data is clustered into 5 clusters\n",
-    "    clustering_method=\"kmedoids\",\n",
+    "    clustering_method=\"ward\",\n",
     "    embedding_plot_settings={\"embedding\": {\"method\": \"pca\"}},\n",
     ")\n",
     "clustering(molecule_set)"

diff --git a/aimsim/__init__.py b/aimsim/__init__.py
@@ -7,4 +7,4 @@
 except ImportError:
     pass  # aimsim_core does not include this
 
-__version__ = "2.1.3"
+__version__ = "2.2.0"
diff --git a/aimsim/chemical_datastructures/molecule_set.py b/aimsim/chemical_datastructures/molecule_set.py
@@ -65,7 +65,6 @@ class MoleculeSet:
             in the dataset.
         cluster(n_clusters=8, clustering_method=None, **kwargs): Cluster
             the molecules of the MoleculeSet. Implemented methods:
-                'kmedoids': for the K-Medoids algorithm.
                 'complete_linkage', 'complete':
                     Complete linkage agglomerative hierarchical
                     clustering.
@@ -997,10 +996,6 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
             clustering_method (str): Clustering algorithm to use. Default is
                 None in which case the algorithm is chosen from the
                 similarity measure in use. Implemented clustering_methods are:
-                'kmedoids': for the K-Medoids algorithm [1].
-                    This method is useful
-                    when the molecular descriptors are continuous / Euclidean
-                    since it relies on the existence of a sensible medoid.
                 'complete_linkage', 'complete':
                     Complete linkage agglomerative hierarchical clustering [2].
                 'average_linkage', 'average':
@@ -1013,7 +1008,6 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
             kwargs (keyword args): Key word arguments to supply to clustering
                 algorithm. See the documentation pages
                 listed below for these arguments:
-                'kmedoids': https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html
                 'complete_linkage', 'average_linkage', 'single_linkage', 'ward'
                     : https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
 
@@ -1037,7 +1031,7 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
                 "meaningful results."
             )
         if (
-            clustering_method == "kmedoids" or clustering_method == "ward"
+            clustering_method == "ward"
         ) and self.similarity_measure.type_ == "discrete":
             print(
                 f"{clustering_method} cannot be used with "
@@ -1047,7 +1041,7 @@ def cluster(self, n_clusters=8, clustering_method=None, **kwargs):
             clustering_method = None
         if clustering_method is None:
             if self.similarity_measure.type_ == "continuous":
-                clustering_method = "kmedoids"
+                clustering_method = "ward"
             else:
                 clustering_method = "complete_linkage"
         self.clusters_ = Cluster(

diff --git a/aimsim/ops/clustering.py b/aimsim/ops/clustering.py
@@ -1,7 +1,6 @@
 """Operation for clustering molecules"""
 import sklearn.exceptions
 from sklearn.cluster import AgglomerativeClustering
-from sklearn_extra.cluster import KMedoids as SklearnExtraKMedoids
 
 
 class Cluster:
@@ -10,10 +9,6 @@ class Cluster:
     Attributes:
         clustering_method (str):
             Label for the specific algorithm used.
-            'kmedoids':
-                for the K-Medoids algorithm [1]. This method is useful
-                when the molecular descriptors are continuous / Euclidean
-                since it relies on the existence of a sensible medoid.
             'complete_linkage', 'complete':
                 Complete linkage agglomerative hierarchical clustering [2].
             'average_linkage', 'average':
@@ -25,7 +20,7 @@ class Cluster:
                 Euclidean descriptors.
         n_clusters (int):
             Number of clusters.
-        model_ (sklearn.cluster.AgglomerativeClustering or sklearn_extra.cluster.KMedoids):
+        model_ (sklearn.cluster.AgglomerativeClustering):
             The clustering estimator.
         labels_ (np.ndarray of shape (n_samples,)):
             cluster labels of the training set samples.
@@ -50,11 +45,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
         Args:
             n_clusters (int): Number of clusters.
             clustering_method(str): Label for the specific algorithm used.
-                Supported methods are:
-                'kmedoids' for the K-Medoids algorithm [1]. This method is
-                    useful when the molecular descriptors are continuous
-                    / Euclidean since it relies on the existence of a
-                    sensible medoid.
                 'complete_linkage', 'complete' for complete linkage
                     agglomerative hierarchical clustering [2].
                 'average_linkage', 'average' for average linkage agglomerative
@@ -65,7 +55,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
                     Euclidean descriptors.
             kwargs (dict): Keyword arguments. These are passed to the
                 estimators. Refer to the following documentation page for
-                kmedoids: https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html
                 agglomerative hierarchical clustering: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
 
         References:
@@ -78,9 +67,7 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
         """
         self.clustering_method = clustering_method
         self.n_clusters = n_clusters
-        if self.clustering_method == "kmedoids":
-            self.model_ = self._get_kmedoids_model_(**kwargs)
-        elif clustering_method in ["complete_linkage", "complete"]:
+        if clustering_method in ["complete_linkage", "complete"]:
             self.model_ = self._get_linkage_model(linkage_method="complete",
                                                   **kwargs)
         elif clustering_method in ["average", "average_linkage"]:
@@ -95,24 +82,6 @@ def __init__(self, n_clusters, clustering_method, **kwargs):
         else:
             raise ValueError(f"{clustering_method} not implemented")
 
-    def _get_kmedoids_model_(self, **kwargs):
-        """
-        Initialize a k-medoids model.
-
-        Args:
-        kwargs (dict): Keyword arguments. These are passed to the
-                estimators. Refer to the following documentation page for
-                kmedoids:
-                [https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html]
-
-        """
-        _ = kwargs.pop('metric', None)
-        return SklearnExtraKMedoids(
-            n_clusters=self.n_clusters,
-            metric="precomputed",
-            **kwargs
-        )
-
     def _get_linkage_model(self, linkage_method, **kwargs):
         _ = kwargs.pop('affinity', None)
         try:

diff --git a/examples/Wang-et-al-log-partition-coefficients/config_logP.yaml b/examples/Wang-et-al-log-partition-coefficients/config_logP.yaml
@@ -55,7 +55,7 @@ tasks:
       plot_color: '#FD6F96'
   cluster:
     n_clusters: 2
-    clustering_method: kmedoids
+    clustering_method: ward
     log_file_path: '/Users/himaghnabhattacharjee/Documents/Research/AIMSim_project/AIMSim/examples/Wang-et-al-log-partition-coefficients/log/cluster_log.txt'
     cluster_file_path: 'log/clusters.yml'
     cluster_plot_settings:

diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,6 @@ seaborn
 tabulate
 numpy
 multiprocess>=0.70
-scikit_learn_extra
 pandas
 # force pyyaml away from specific versions: https://github.com/yaml/pyyaml/issues/724
 pyyaml!=6.0.0,!=5.4.0,!=5.4.1,<7

diff --git a/requirements_core.txt b/requirements_core.txt
@@ -1,6 +1,5 @@
 psutil
 scikit_learn
-scikit_learn_extra
 rdkit
 numpy
 pandas

diff --git a/tests/test_MoleculeSet.py b/tests/test_MoleculeSet.py
@@ -1261,6 +1261,7 @@ def test_invalid_transform_error(self):
             )
         remove(csv_fpath)
 
+    @unittest.skip(reason="kmedoids was removed, obsoleting this test")
     def test_clustering_fingerprints(self):
         """
         Test the clustering of molecules featurized by their fingerprints.
@@ -1290,8 +1291,8 @@ def test_clustering_fingerprints(self):
                     if molecule_set.similarity_measure.type_ == "continuous":
                         self.assertEqual(
                             str(molecule_set.clusters_),
-                            "kmedoids",
-                            f"Expected kmedoids clustering for "
+                            "ward",
+                            f"Expected ward clustering for "
                             f"similarity: {similarity_measure}",
                         )
                     else: