From 4d44d7ae314f77ae62f4992d128e4014c9f95606 Mon Sep 17 00:00:00 2001 From: Umberto Date: Fri, 17 Jan 2020 18:17:11 +0100 Subject: [PATCH 1/4] More precision in documenting kind in mapper covers --- giotto/mapper/cover.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/giotto/mapper/cover.py b/giotto/mapper/cover.py index 6fa04179d..ceb8c349c 100644 --- a/giotto/mapper/cover.py +++ b/giotto/mapper/cover.py @@ -38,7 +38,7 @@ class OneDimensionalCover(BaseEstimator, TransformerMixin): Parameters ---------- - kind : str, optional, default: ``'uniform'`` + kind : ``'uniform'`` | ``'balanced'``, optional, default: ``'uniform'`` The kind of cover to use. n_intervals : int, optional, default: ``10`` @@ -343,7 +343,7 @@ class CubicalCover(BaseEstimator, TransformerMixin): Parameters ---------- - kind : str, optional, default: ``'uniform'`` + kind : ``'uniform'`` | ``'balanced'``, optional, default: ``'uniform'`` The kind of cover to use. n_intervals : int, optional, default: ``10`` From 7ceb84d1ab1353376a7fa4a15d4f5a2cee2df644 Mon Sep 17 00:00:00 2001 From: Umberto Date: Fri, 17 Jan 2020 18:24:50 +0100 Subject: [PATCH 2/4] Avoid repeated logic between fit and fit_transform in Nerve --- giotto/mapper/nerve.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/giotto/mapper/nerve.py b/giotto/mapper/nerve.py index 0958a00e5..e18fbf971 100644 --- a/giotto/mapper/nerve.py +++ b/giotto/mapper/nerve.py @@ -75,11 +75,7 @@ def fit(self, X, y=None): """ # TODO: Include a validation step for X - self.X_ = reduce(iconcat, X, []) - # Preprocess X by 1) flattening and 2) extending each tuple - self.X_ = [(node_info[0], *node_info[1]) - for node_info in zip(range(len(self.X_)), self.X_)] - self.edges_ = list(self._generate_edges(self.X_)) + self.X_, self.edges_ = self._graph_data_creation(X) return self def fit_transform(self, X, y=None, **fit_params): @@ -113,11 +109,7 @@ def fit_transform(self, X, y=None, **fit_params): """ # TODO: Include a validation step for X - _X = reduce(iconcat, X, []) - # Preprocess X by 1) flattening and 2) extending each tuple - _X = [(node_info[0], *node_info[1]) - for node_info in zip(range(len(_X)), _X)] - _edges = self._generate_edges(_X) + _X, _edges = self._graph_data_creation(X) # Graph construction graph = ig.Graph() @@ -132,6 +124,14 @@ def fit_transform(self, X, y=None, **fit_params): zip(*_X))) return graph + def _graph_data_creation(self, X): + X_ = reduce(iconcat, X, []) + # Preprocess X by 1) flattening and 2) extending each tuple + X_ = [(node_info[0], *node_info[1]) + for node_info in zip(range(len(X_)), X_)] + edges_ = self._generate_edges(X_) + return X_, edges_ + @staticmethod def _pairwise_intersections(min_intersection, node_pair): data = dict() From fb7b75bb38a93090b5fa63edbd55c6c80ce6353b Mon Sep 17 00:00:00 2001 From: Umberto Date: Fri, 17 Jan 2020 18:33:23 +0100 Subject: [PATCH 3/4] More precise description of node_id in mapper_quickstart --- examples/mapper_quickstart.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mapper_quickstart.ipynb b/examples/mapper_quickstart.ipynb index d02c19202..972b784a3 100644 --- a/examples/mapper_quickstart.ipynb +++ b/examples/mapper_quickstart.ipynb @@ -359,7 +359,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here `node_id` is an identifier used to construct the graph, while `pullback_set_label` and `partial_cluster_label` refer to the interval and cluster sets described above. The `node_elements` refers to the indices of our original data that belong to each node. For example, to find which points belong to the first node of the graph we can access the desired data as follows:" + "Here `node_id` is a globally unique node identifier used to construct the graph, while `pullback_set_label` and `partial_cluster_label` refer to the interval and cluster sets described above. The `node_elements` refers to the indices of our original data that belong to each node. For example, to find which points belong to the first node of the graph we can access the desired data as follows:" ] }, { From be5b6c7b05a432544915cab96097e50a6664a672 Mon Sep 17 00:00:00 2001 From: Umberto Date: Mon, 20 Jan 2020 10:31:18 +0100 Subject: [PATCH 4/4] Rename parallel_clustering_{n_jobs, prefer} With n_jobs and parallel_backend_prefer respectively. Also update documentation and mapper_quickstart.ipynb accordingly. --- examples/mapper_quickstart.ipynb | 8 ++++---- giotto/mapper/cluster.py | 16 ++++++++-------- giotto/mapper/pipeline.py | 31 ++++++++++++++++--------------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/examples/mapper_quickstart.ipynb b/examples/mapper_quickstart.ipynb index 972b784a3..0817a7073 100644 --- a/examples/mapper_quickstart.ipynb +++ b/examples/mapper_quickstart.ipynb @@ -137,7 +137,7 @@ "clusterer = DBSCAN()\n", "\n", "# configure parallelism of clustering step\n", - "parallel_clustering_n_jobs = 1\n", + "n_jobs = 1\n", "\n", "# initialise pipeline\n", "pipe = make_mapper_pipeline(\n", @@ -145,7 +145,7 @@ " cover=cover,\n", " clusterer=clusterer,\n", " verbose=False,\n", - " parallel_clustering_n_jobs=parallel_clustering_n_jobs,\n", + " n_jobs=n_jobs,\n", ")" ] }, @@ -430,7 +430,7 @@ " cover=cover,\n", " clusterer=clusterer,\n", " verbose=True,\n", - " parallel_clustering_n_jobs=parallel_clustering_n_jobs,\n", + " n_jobs=n_jobs,\n", ")" ] }, @@ -473,7 +473,7 @@ " cover=cover,\n", " clusterer=clusterer,\n", " verbose=True,\n", - " parallel_clustering_n_jobs=parallel_clustering_n_jobs,\n", + " n_jobs=n_jobs,\n", ")" ] }, diff --git a/giotto/mapper/cluster.py b/giotto/mapper/cluster.py index b094f31e4..ceda72d71 100644 --- a/giotto/mapper/cluster.py +++ b/giotto/mapper/cluster.py @@ -36,12 +36,12 @@ class ParallelClustering(BaseEstimator): :class:`sklearn.base.ClusterMixin`. ``None`` means that the default :class:`sklearn.cluster.DBSCAN` is used. - parallel_clustering_n_jobs : int or None, optional, default: ``None`` + n_jobs : int or None, optional, default: ``None`` The number of jobs to use for the computation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. - parallel_clustering_prefer : ``'processes'`` | ``'threads'``, optional, \ + parallel_backend_prefer : ``'processes'`` | ``'threads'``, optional, \ default: ``'threads'`` Selects the default joblib backend. The default process-based backend is 'loky' and the default thread-based backend is 'threading'. @@ -64,11 +64,11 @@ class ParallelClustering(BaseEstimator): """ def __init__(self, clusterer=None, - parallel_clustering_n_jobs=None, - parallel_clustering_prefer='threads'): + n_jobs=None, + parallel_backend_prefer='threads'): self.clusterer = clusterer - self.parallel_clustering_n_jobs = parallel_clustering_n_jobs - self.parallel_clustering_prefer = parallel_clustering_prefer + self.n_jobs = n_jobs + self.parallel_backend_prefer = parallel_backend_prefer def _validate_clusterer(self, default=DBSCAN()): """Set :attr:`clusterer_` depending on the value of `clusterer`. @@ -136,8 +136,8 @@ def fit(self, X, y=None, sample_weight=None): else: single_fitter = self._fit_single_abs_labels - self.clusterers_ = Parallel(n_jobs=self.parallel_clustering_n_jobs, - prefer=self.parallel_clustering_prefer)( + self.clusterers_ = Parallel(n_jobs=self.n_jobs, + prefer=self.parallel_backend_prefer)( delayed(single_fitter)( X_tot, np.flatnonzero(mask), mask_num, sample_weight=sample_weights[mask_num]) diff --git a/giotto/mapper/pipeline.py b/giotto/mapper/pipeline.py index 7fe98de48..b366f8528 100644 --- a/giotto/mapper/pipeline.py +++ b/giotto/mapper/pipeline.py @@ -11,8 +11,8 @@ global_pipeline_params = ('memory', 'verbose') nodes_params = ('scaler', 'filter_func', 'cover') clust_prepr_params = ('clustering_preprocessing',) -clust_params = ('clusterer', 'parallel_clustering_n_jobs', - 'parallel_clustering_prefer') +clust_params = ('clusterer', 'n_jobs', + 'parallel_backend_prefer') nerve_params = ('min_intersection',) clust_prepr_params_prefix = 'pullback_cover__' nodes_params_prefix = 'pullback_cover__map_and_cover__' @@ -145,8 +145,8 @@ def make_mapper_pipeline(scaler=None, cover=None, clustering_preprocessing=None, clusterer=None, - parallel_clustering_n_jobs=None, - parallel_clustering_prefer='threads', + n_jobs=None, + parallel_backend_prefer='threads', graph_step=True, min_intersection=1, memory=None, @@ -186,18 +186,19 @@ def make_mapper_pipeline(scaler=None, Clustering object. ``None`` means using DBSCAN (:meth:`sklearn.cluster.DBSCAN`) with its default parameters. - parallel_clustering_n_jobs : int or None, optional, default: ``None`` + n_jobs : int or None, optional, default: ``None`` The number of jobs to use in a joblib-parallel application of the - clustering step across pullback cover sets. ``None`` means 1 unless + clustering step across pullback cover sets. To be used in + conjunction with `parallel_backend_prefer`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. - parallel_clustering_prefer : ``'processes'`` | ``'threads'``, optional, \ + parallel_backend_prefer : ``'processes'`` | ``'threads'``, optional, \ default: ``'threads'`` - Selects the default joblib backend to use in a joblib-parallel - application of the clustering step across pullback cover sets. - The default process-based backend is 'loky' and the default - thread-based backend is 'threading'. See [2]_. + Soft hint for the default joblib backend to use in a joblib-parallel + application of the clustering step across pullback cover sets. To be + used in conjunction with `n_jobs`. The default process-based backend is + 'loky' and the default thread-based backend is 'threading'. See [2]_. graph_step : bool, optional, default: ``True`` Whether the resulting pipeline should stop at the calculation of the @@ -287,7 +288,7 @@ def make_mapper_pipeline(scaler=None, >>> # clustering across the pullback cover sets can be beneficial >>> from sklearn.cluster import DBSCAN >>> mapper = make_mapper_pipeline(clusterer=DBSCAN(), - ... parallel_clustering_n_jobs=6, + ... n_jobs=6, ... memory=mkdtemp(), ... verbose=True) >>> X = np.random.random((100000, 4)) @@ -298,7 +299,7 @@ def make_mapper_pipeline(scaler=None, [Pipeline] .... (step 1 of 3) Processing pullback_cover, total= 0.7s [Pipeline] ........ (step 2 of 3) Processing clustering, total= 1.9s [Pipeline] ............. (step 3 of 3) Processing nerve, total= 0.3s - >>> mapper.set_params(parallel_clustering_n_jobs=1) + >>> mapper.set_params(n_jobs=1) >>> mapper.fit_transform(X) [Pipeline] ........ (step 2 of 3) Processing clustering, total= 5.3s [Pipeline] ............. (step 3 of 3) Processing nerve, total= 0.3s @@ -366,8 +367,8 @@ def make_mapper_pipeline(scaler=None, ('map_and_cover', map_and_cover)])), ('clustering', ParallelClustering( clusterer=_clusterer, - parallel_clustering_n_jobs=parallel_clustering_n_jobs, - parallel_clustering_prefer=parallel_clustering_prefer)) + n_jobs=n_jobs, + parallel_backend_prefer=parallel_backend_prefer)) ] if graph_step: