Merge pull request #189 from ulupo/minor_mapper_enhancements
Minor mapper enhancements
ulupo authored Jan 20, 2020
2 parents 7857e09 + be5b6c7 commit 5f9cb1d
Showing 5 changed files with 41 additions and 40 deletions.
10 changes: 5 additions & 5 deletions examples/mapper_quickstart.ipynb
@@ -137,15 +137,15 @@
"clusterer = DBSCAN()\n",
"\n",
"# configure parallelism of clustering step\n",
"parallel_clustering_n_jobs = 1\n",
"n_jobs = 1\n",
"\n",
"# initialise pipeline\n",
"pipe = make_mapper_pipeline(\n",
" filter_func=filter_func,\n",
" cover=cover,\n",
" clusterer=clusterer,\n",
" verbose=False,\n",
" parallel_clustering_n_jobs=parallel_clustering_n_jobs,\n",
" n_jobs=n_jobs,\n",
")"
]
},
@@ -359,7 +359,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Here `node_id` is an identifier used to construct the graph, while `pullback_set_label` and `partial_cluster_label` refer to the interval and cluster sets described above. The `node_elements` refers to the indices of our original data that belong to each node. For example, to find which points belong to the first node of the graph we can access the desired data as follows:"
"Here `node_id` is a globally unique node identifier used to construct the graph, while `pullback_set_label` and `partial_cluster_label` refer to the interval and cluster sets described above. The `node_elements` refers to the indices of our original data that belong to each node. For example, to find which points belong to the first node of the graph we can access the desired data as follows:"
]
},
{
@@ -430,7 +430,7 @@
" cover=cover,\n",
" clusterer=clusterer,\n",
" verbose=True,\n",
" parallel_clustering_n_jobs=parallel_clustering_n_jobs,\n",
" n_jobs=n_jobs,\n",
")"
]
},
@@ -473,7 +473,7 @@
" cover=cover,\n",
" clusterer=clusterer,\n",
" verbose=True,\n",
" parallel_clustering_n_jobs=parallel_clustering_n_jobs,\n",
" n_jobs=n_jobs,\n",
")"
]
},
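A minimal sketch of the renamed keyword in context, mirroring the notebook cell above. The point cloud and the PCA filter below are hypothetical stand-ins for what the notebook defines in earlier cells; only make_mapper_pipeline, CubicalCover, DBSCAN and the n_jobs keyword are taken from this commit:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from giotto.mapper.cover import CubicalCover
from giotto.mapper.pipeline import make_mapper_pipeline

X = np.random.random((500, 3))     # hypothetical point cloud
filter_func = PCA(n_components=2)  # hypothetical filter function
cover = CubicalCover()
clusterer = DBSCAN()

pipe = make_mapper_pipeline(
    filter_func=filter_func,
    cover=cover,
    clusterer=clusterer,
    verbose=False,
    n_jobs=1,  # renamed from parallel_clustering_n_jobs
)
graph = pipe.fit_transform(X)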
16 changes: 8 additions & 8 deletions giotto/mapper/cluster.py
@@ -36,12 +36,12 @@ class ParallelClustering(BaseEstimator):
:class:`sklearn.base.ClusterMixin`. ``None`` means that the default
:class:`sklearn.cluster.DBSCAN` is used.
-parallel_clustering_n_jobs : int or None, optional, default: ``None``
+n_jobs : int or None, optional, default: ``None``
The number of jobs to use for the computation. ``None`` means 1
unless in a :obj:`joblib.parallel_backend` context. ``-1`` means
using all processors.
-parallel_clustering_prefer : ``'processes'`` | ``'threads'``, optional, \
+parallel_backend_prefer : ``'processes'`` | ``'threads'``, optional, \
default: ``'threads'``
Selects the default joblib backend. The default process-based backend
is 'loky' and the default thread-based backend is 'threading'.
@@ -64,11 +64,11 @@
"""

def __init__(self, clusterer=None,
-parallel_clustering_n_jobs=None,
-parallel_clustering_prefer='threads'):
+n_jobs=None,
+parallel_backend_prefer='threads'):
self.clusterer = clusterer
-self.parallel_clustering_n_jobs = parallel_clustering_n_jobs
-self.parallel_clustering_prefer = parallel_clustering_prefer
+self.n_jobs = n_jobs
+self.parallel_backend_prefer = parallel_backend_prefer

def _validate_clusterer(self, default=DBSCAN()):
"""Set :attr:`clusterer_` depending on the value of `clusterer`.
@@ -136,8 +136,8 @@ def fit(self, X, y=None, sample_weight=None):
else:
single_fitter = self._fit_single_abs_labels

-self.clusterers_ = Parallel(n_jobs=self.parallel_clustering_n_jobs,
-prefer=self.parallel_clustering_prefer)(
+self.clusterers_ = Parallel(n_jobs=self.n_jobs,
+prefer=self.parallel_backend_prefer)(
delayed(single_fitter)(
X_tot, np.flatnonzero(mask),
mask_num, sample_weight=sample_weights[mask_num])
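The updated fit above hands the renamed attributes straight to joblib. A toy sketch of that dispatch pattern, with a stand-in function in place of the internal per-mask fitter (_fit_single_abs_labels in the real code):

from joblib import Parallel, delayed

def fit_one(mask_num):
    # stand-in for the per-pullback-set clustering performed in fit
    return mask_num

# n_jobs and parallel_backend_prefer map directly onto joblib's n_jobs and prefer.
results = Parallel(n_jobs=2, prefer='threads')(
    delayed(fit_one)(mask_num) for mask_num in range(5)
)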
4 changes: 2 additions & 2 deletions giotto/mapper/cover.py
@@ -38,7 +38,7 @@ class OneDimensionalCover(BaseEstimator, TransformerMixin):
Parameters
----------
-kind : str, optional, default: ``'uniform'``
+kind : ``'uniform'`` | ``'balanced'``, optional, default: ``'uniform'``
The kind of cover to use.
n_intervals : int, optional, default: ``10``
@@ -343,7 +343,7 @@ class CubicalCover(BaseEstimator, TransformerMixin):
Parameters
----------
-kind : str, optional, default: ``'uniform'``
+kind : ``'uniform'`` | ``'balanced'``, optional, default: ``'uniform'``
The kind of cover to use.
n_intervals : int, optional, default: ``10``
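The docstring edits enumerate the two admissible values of kind. A sketch of instantiating a cover with the documented parameters; the input convention (a 1D array of filter values) and the boolean-mask output are assumptions, as neither is shown in this diff:

import numpy as np
from giotto.mapper.cover import OneDimensionalCover

filter_values = np.random.random(100)  # hypothetical filter values
cover = OneDimensionalCover(kind='balanced', n_intervals=10)
masks = cover.fit_transform(filter_values)  # presumably one boolean column per interval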
20 changes: 10 additions & 10 deletions giotto/mapper/nerve.py
@@ -75,11 +75,7 @@ def fit(self, X, y=None):
"""
# TODO: Include a validation step for X
-self.X_ = reduce(iconcat, X, [])
-# Preprocess X by 1) flattening and 2) extending each tuple
-self.X_ = [(node_info[0], *node_info[1])
-           for node_info in zip(range(len(self.X_)), self.X_)]
-self.edges_ = list(self._generate_edges(self.X_))
+self.X_, self.edges_ = self._graph_data_creation(X)
return self

def fit_transform(self, X, y=None, **fit_params):
@@ -113,11 +109,7 @@ def fit_transform(self, X, y=None, **fit_params):
"""
# TODO: Include a validation step for X
-_X = reduce(iconcat, X, [])
-# Preprocess X by 1) flattening and 2) extending each tuple
-_X = [(node_info[0], *node_info[1])
-      for node_info in zip(range(len(_X)), _X)]
-_edges = self._generate_edges(_X)
+_X, _edges = self._graph_data_creation(X)

# Graph construction
graph = ig.Graph()
@@ ... @@
zip(*_X)))
return graph

+def _graph_data_creation(self, X):
+    X_ = reduce(iconcat, X, [])
+    # Preprocess X by 1) flattening and 2) extending each tuple
+    X_ = [(node_info[0], *node_info[1])
+          for node_info in zip(range(len(X_)), X_)]
+    edges_ = self._generate_edges(X_)
+    return X_, edges_

@staticmethod
def _pairwise_intersections(min_intersection, node_pair):
data = dict()
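The new _graph_data_creation helper factors out the flatten-and-relabel preprocessing that fit and fit_transform previously duplicated. A toy illustration of steps 1) and 2) in its comment, using made-up node-info tuples of the (pullback_set_label, partial_cluster_label, node_elements) form described in the quickstart notebook:

from functools import reduce
from operator import iconcat

# One list of node-info tuples per pullback cover set (toy values).
X = [[(0, 0, [0, 1]), (0, 1, [2])],
     [(1, 0, [1, 3])]]

flat = reduce(iconcat, X, [])  # 1) flatten across cover sets
flat = [(node_id, *info) for node_id, info in enumerate(flat)]  # 2) prepend a global node id
# flat == [(0, 0, 0, [0, 1]), (1, 0, 1, [2]), (2, 1, 0, [1, 3])]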
31 changes: 16 additions & 15 deletions giotto/mapper/pipeline.py
@@ -11,8 +11,8 @@
global_pipeline_params = ('memory', 'verbose')
nodes_params = ('scaler', 'filter_func', 'cover')
clust_prepr_params = ('clustering_preprocessing',)
-clust_params = ('clusterer', 'parallel_clustering_n_jobs',
-                'parallel_clustering_prefer')
+clust_params = ('clusterer', 'n_jobs',
+                'parallel_backend_prefer')
nerve_params = ('min_intersection',)
clust_prepr_params_prefix = 'pullback_cover__'
nodes_params_prefix = 'pullback_cover__map_and_cover__'
@@ -145,8 +145,8 @@ def make_mapper_pipeline(scaler=None,
cover=None,
clustering_preprocessing=None,
clusterer=None,
-parallel_clustering_n_jobs=None,
-parallel_clustering_prefer='threads',
+n_jobs=None,
+parallel_backend_prefer='threads',
graph_step=True,
min_intersection=1,
memory=None,
@@ -186,18 +186,19 @@
Clustering object. ``None`` means using DBSCAN
(:meth:`sklearn.cluster.DBSCAN`) with its default parameters.
-parallel_clustering_n_jobs : int or None, optional, default: ``None``
+n_jobs : int or None, optional, default: ``None``
The number of jobs to use in a joblib-parallel application of the
-clustering step across pullback cover sets. ``None`` means 1 unless
+clustering step across pullback cover sets. To be used in
+conjunction with `parallel_backend_prefer`. ``None`` means 1 unless
in a :obj:`joblib.parallel_backend` context. ``-1`` means using all
processors.
-parallel_clustering_prefer : ``'processes'`` | ``'threads'``, optional, \
+parallel_backend_prefer : ``'processes'`` | ``'threads'``, optional, \
default: ``'threads'``
-Selects the default joblib backend to use in a joblib-parallel
-application of the clustering step across pullback cover sets.
-The default process-based backend is 'loky' and the default
-thread-based backend is 'threading'. See [2]_.
+Soft hint for the default joblib backend to use in a joblib-parallel
+application of the clustering step across pullback cover sets. To be
+used in conjunction with `n_jobs`. The default process-based backend is
+'loky' and the default thread-based backend is 'threading'. See [2]_.
graph_step : bool, optional, default: ``True``
Whether the resulting pipeline should stop at the calculation of the
@@ -287,7 +288,7 @@
>>> # clustering across the pullback cover sets can be beneficial
>>> from sklearn.cluster import DBSCAN
>>> mapper = make_mapper_pipeline(clusterer=DBSCAN(),
-... parallel_clustering_n_jobs=6,
+... n_jobs=6,
... memory=mkdtemp(),
... verbose=True)
>>> X = np.random.random((100000, 4))
@@ -298,7 +299,7 @@
[Pipeline] .... (step 1 of 3) Processing pullback_cover, total= 0.7s
[Pipeline] ........ (step 2 of 3) Processing clustering, total= 1.9s
[Pipeline] ............. (step 3 of 3) Processing nerve, total= 0.3s
->>> mapper.set_params(parallel_clustering_n_jobs=1)
+>>> mapper.set_params(n_jobs=1)
>>> mapper.fit_transform(X)
[Pipeline] ........ (step 2 of 3) Processing clustering, total= 5.3s
[Pipeline] ............. (step 3 of 3) Processing nerve, total= 0.3s
@@ -366,8 +367,8 @@
('map_and_cover', map_and_cover)])),
('clustering', ParallelClustering(
clusterer=_clusterer,
-parallel_clustering_n_jobs=parallel_clustering_n_jobs,
-parallel_clustering_prefer=parallel_clustering_prefer))
+n_jobs=n_jobs,
+parallel_backend_prefer=parallel_backend_prefer))
]

if graph_step:
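As the updated docstrings note, the two renamed keywords travel together from make_mapper_pipeline down to the ParallelClustering step above. A short sketch of setting both, plus the set_params rename shown in the docstring example (parameter values are arbitrary):

from sklearn.cluster import DBSCAN
from giotto.mapper.pipeline import make_mapper_pipeline

mapper = make_mapper_pipeline(
    clusterer=DBSCAN(),
    n_jobs=-1,                          # -1 uses all processors (joblib semantics)
    parallel_backend_prefer='threads',  # soft hint; the process-based backend is 'loky'
)
mapper.set_params(n_jobs=1)  # the renamed keyword stays exposed through set_params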
