From c2cdd6d3ae32f805c156d41190ac38f486ffd4a6 Mon Sep 17 00:00:00 2001 From: DOHMATOB Elvis Date: Fri, 19 May 2017 17:52:47 +0200 Subject: [PATCH] ENH: implemented social sparsity shrinkage --- modl/datasets/fmri.py | 14 ++- modl/datasets/hcp.py | 4 +- modl/dict_fact.py | 80 ++++++++++------ modl/fmri.py | 75 +++++++++------ modl/social_sparsity.py | 168 ++++++++++++++++++++++++--------- modl/structured_dict_update.py | 2 + 6 files changed, 235 insertions(+), 108 deletions(-) diff --git a/modl/datasets/fmri.py b/modl/datasets/fmri.py index c82d2bd..00ced12 100644 --- a/modl/datasets/fmri.py +++ b/modl/datasets/fmri.py @@ -1,5 +1,6 @@ +from collections import OrderedDict import json -from os.path import join +from os.path import join, sep from nilearn import datasets from nilearn.datasets import fetch_atlas_smith_2009 @@ -27,7 +28,14 @@ def load_rest_func(data_dir=None, dataset='adhd', n_subjects=40, except IOError: # FileNotFoundError raise ValueError( 'Please unmask the data using hcp_prepare.py first.') - data = sorted(list(mapping.values())) + data = mapping.values() + data_ = OrderedDict() + for masked in data: + subject_id = masked.split(sep)[-5] + if subject_id not in data_: + data_[subject_id] = [] + data_[subject_id].append(masked) + data = data_.values() else: hcp_dataset = fetch_hcp_rest(data_dir=data_dir, n_subjects=n_subjects) @@ -35,7 +43,7 @@ def load_rest_func(data_dir=None, dataset='adhd', n_subjects=40, # list of 4D nifti files for each subject data = hcp_dataset.func # Flatten it - data = [(record for record in subject) for subject in data] + data = [[record for record in subject] for subject in data] else: raise NotImplementedError train_data, test_data = train_test_split(data, diff --git a/modl/datasets/hcp.py b/modl/datasets/hcp.py index f9737e5..2cf704d 100644 --- a/modl/datasets/hcp.py +++ b/modl/datasets/hcp.py @@ -285,7 +285,7 @@ def fetch_hcp_rest(data_dir=None, n_subjects=500): if head == "HCP": subject_pattern = "*/*" elif head == "HCP900": - subject_pattern == "*" + subject_pattern = "*" else: raise ValueError list_dir = sorted(glob.glob(join(source_dir, subject_pattern, @@ -321,7 +321,7 @@ def fetch_hcp_rest(data_dir=None, n_subjects=500): func.append(subject_func) results = {'func': func, 'meta': meta, - 'mask': mask, + 'mask': mask_img, 'description': "'Human connectome project"} return Bunch(**results) diff --git a/modl/dict_fact.py b/modl/dict_fact.py index acebae8..ade48de 100644 --- a/modl/dict_fact.py +++ b/modl/dict_fact.py @@ -55,6 +55,7 @@ def __init__(self, max_iter=100, rand_size=True, replacement=True, + bcd_n_iter=1, atomic_prox=None, mask=None ): @@ -187,6 +188,7 @@ def __init__(self, self.rand_size = rand_size self.replacement = replacement + self.bcd_n_iter = bcd_n_iter self.atomic_prox = atomic_prox self.mask = mask @@ -424,7 +426,7 @@ def transform(self, X): self.code_l1_ratio, self.code_alpha, self.code_pos, self.tol, self.max_iter) res = self.pool_.map(par_func, batches) - _ = list(res) + list(res) else: _enet_regression_single_gram( G, Dx, X, code, sample_indices, @@ -599,7 +601,7 @@ def _compute_code(self, X, sample_indices, self.code_l1_ratio, self.code_alpha, self.code_pos, self.tol, self.max_iter) - def _update_dict(self, subset): + def _update_dict(self, subset=None): """Dictionary update part Parameters @@ -612,41 +614,61 @@ def _update_dict(self, subset): self.components_)) len_subset = subset.shape[0] n_components, n_features = self.components_.shape - components_subset = self.components_[:, subset] - atom_temp = 
np.zeros_like(components_subset[0]) + components_subset = self.components_ gradient_subset = self.gradient_[:, subset] + if subset is not None: + components_subset = components_subset[:, subset] + gradient_subset = gradient_subset[:, subset] + atom_temp = np.zeros_like(components_subset[0]) if self.G_agg == 'full' and len_subset < n_features / 2.: self.G_ -= components_subset.dot(components_subset.T) gradient_subset -= self.C_.dot(components_subset) - for idx, k in enumerate(self.random_state.permutation(n_components)): - gradient_subset = ger(1., self.C_[k], components_subset[k], - a=gradient_subset, overwrite_a=True) - if self.C_[k, k] > 1e-20: - components_subset[k] = gradient_subset[k] / self.C_[k, k] - if self.atomic_prox is not None: - components_subset[k] = _atomic_prox( - components_subset[k], self.C_[k, k], mask=self.mask, - which=self.atomic_prox, alpha=self.comp_alpha, - l1_ratio=self.comp_l1_ratio, verbose=self.verbose, - counter=self.n_iter_, idx=idx) + delta_dict = np.inf + for bcd_iter in range(self.bcd_n_iter): + if self.verbose: + print("[BCD] iter %02i/%02i" % (bcd_iter + 1, self.bcd_n_iter)) + print("\t ||D - D_old||_F / ||D_old||_F = %g" % delta_dict) + old_dict = components_subset.copy() + order = self.random_state.permutation(n_components) + for idx, k in enumerate(order): + gradient_subset = ger(1., self.C_[k], components_subset[k], + a=gradient_subset, overwrite_a=True) + if self.C_[k, k] > 1e-20: + components_subset[k] = gradient_subset[k] / self.C_[k, k] + if self.atomic_prox is not None: + components_subset[k] = _atomic_prox( + components_subset[k], self.C_[k, k], + mask=self.mask, which=self.atomic_prox, + alpha=self.comp_alpha, l1_ratio=self.comp_l1_ratio, + counter=self.n_iter_, verbose=self.verbose, + idx=idx) + else: + components_subset[k] = enet_projection( + components_subset[k], atom_temp, 1., + self.comp_l1_ratio) + components_subset[k] = atom_temp + gradient_subset = ger( + -1., self.C_[k], components_subset[k], + a=gradient_subset, overwrite_a=True) else: - components_subset[k] = enet_projection( - components_subset[k], atom_temp, 1., - self.comp_l1_ratio) - gradient_subset = ger(-1., self.C_[k], components_subset[k], - a=gradient_subset, overwrite_a=True) - else: - if self.verbose == 1: - sys.stdout.write("+") - sys.stdout.flush() - elif self.verbose: - print("Adding new random atom") - components_subset[k] = self.random_state.randn(n_features) - - self.components_[:, subset] = components_subset + if self.verbose == 1: + sys.stdout.write("+") + sys.stdout.flush() + elif self.verbose: + print("Adding new random atom") + components_subset[k] = self.random_state.randn(n_features) + + delta_dict = np.sum((components_subset - old_dict) ** 2) + delta_dict /= np.sum(old_dict ** 2) + delta_dict = np.sqrt(delta_dict) + + if subset is None: + self.components_ = components_subset + else: + self.components_[:, subset] = components_subset if self.G_agg == 'full': if len_subset < n_features / 2.: diff --git a/modl/fmri.py b/modl/fmri.py index 838a2bc..cc6f731 100644 --- a/modl/fmri.py +++ b/modl/fmri.py @@ -163,6 +163,7 @@ def __init__(self, buffer_size=None, n_jobs=1, verbose=0, callback=None, + bcd_n_iter=1, atomic_prox=None): BaseDecomposition.__init__(self, n_components=n_components, random_state=random_state, @@ -188,6 +189,7 @@ def __init__(self, self.n_epochs = n_epochs self.batch_size = batch_size self.reduction = reduction + self.bcd_n_iter = bcd_n_iter self.method = method @@ -198,24 +200,7 @@ def __init__(self, self.atomic_prox = atomic_prox - def 
fit(self, imgs=None, y=None, confounds=None, raw=False):
-        """Compute the mask and the dictionary maps across subjects
-
-        Parameters
-        ----------
-        imgs: list of Niimg-like objects
-            See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
-            Data on which PCA must be calculated. If this is a list,
-            the affine is considered the same for all.
-
-        confounds: CSV file path or 2D matrix
-            This parameter is passed to nilearn.signal.clean. Please see the
-            related documentation for details
-
-        Returns
-        -------
-        self
-        """
+    def _prepare(self, imgs=None, y=None, confounds=None, raw=False):
         if imgs is None or self.n_epochs == 0:
             # Will raise error is mask has not been provided
             if self.mask is None:
@@ -260,6 +245,7 @@ def fit(self, imgs=None, y=None, confounds=None, raw=False):
                 random_state=self.random_state,
                 n_threads=self.n_jobs,
                 verbose=0,
+                bcd_n_iter=self.bcd_n_iter,
                 atomic_prox=self.atomic_prox,
                 mask=mask)
             self.dict_fact_.prepare(n_samples=n_samples, n_features=n_voxels,
@@ -286,8 +272,6 @@ def fit(self, imgs=None, y=None, confounds=None, raw=False):
         G_agg = method['G_agg']
         Dx_agg = method['Dx_agg']
 
-        n_records = len(imgs)
-
         if self.verbose:
             print("Preloading data")
@@ -315,7 +299,12 @@ def fit(self, imgs=None, y=None, confounds=None, raw=False):
         n_samples = indices_list[-1] + 1
         n_voxels = np.sum(check_niimg(self.masker_.mask_img_).get_data() != 0)
 
-        if self.dict_init is not None:
+        if hasattr(self, "components_") and self.components_ is not None:
+            # Warm restart: reuse the dictionary learned by a previous
+            # call to fit as the initialization.
+            dict_init = self.components_
+            n_components = len(self.components_)
+        elif self.dict_init is not None:
             if self.verbose:
                 print("Preloading initial dictionary")
             if isinstance(self.dict_init, _basestring) and \
@@ -342,6 +331,7 @@ def fit(self, imgs=None, y=None, confounds=None, raw=False):
         elif self.mask_img is not None:
             mask = self.mask_img_.get_data().astype(np.bool)
 
+        self.components_ = dict_init
         self.dict_fact_ = DictFact(n_components=n_components,
                                    code_alpha=self.alpha,
                                    code_l1_ratio=self.code_l1_ratio,
@@ -355,11 +345,34 @@ def fit(self, imgs=None, y=None, confounds=None, raw=False):
                                    random_state=self.random_state,
                                    n_threads=self.n_jobs,
                                    verbose=self.verbose,
+                                   bcd_n_iter=self.bcd_n_iter,
+                                   atomic_prox=self.atomic_prox,
                                    mask=mask)
         self.dict_fact_.prepare(n_samples=n_samples, n_features=n_voxels,
                                 X=dict_init, dtype=dtype)
+        return data_list, indices_list, shelving
+
+    def fit(self, imgs=None, y=None, confounds=None, raw=False):
+        """Compute the mask and the dictionary maps across subjects
+
+        Parameters
+        ----------
+        imgs: list of Niimg-like objects
+            See http://nilearn.github.io/building_blocks/manipulating_mr_images.html#niimg.
+            Data on which PCA must be calculated. If this is a list,
+            the affine is considered the same for all.
+
+        confounds: CSV file path or 2D matrix
+            This parameter is passed to nilearn.signal.clean. 
Please see the + related documentation for details + + Returns + ------- + self + """ + data_list, indices_list, shelving = self._prepare( + imgs=imgs, y=y, confounds=confounds, raw=raw) + n_records = len(imgs) current_n_records = 0 for i in range(self.n_epochs): if self.verbose: @@ -372,10 +385,7 @@ def fit(self, imgs=None, y=None, confounds=None, raw=False): data = data[::self.reduction] sample_indices = np.arange(indices_list[record], indices_list[record + 1]) - if raw: - if isinstance(img, basestring): - data = np.load(data, mmap_mode="r") - else: + if not raw: if shelving: data = data.get() else: @@ -443,7 +453,7 @@ def score(self, imgs, confounds=None): score /= len(data_list) return score - def transform(self, imgs, confounds=None): + def transform(self, imgs, confounds=None, raw=False): """Compute the mask and the ICA maps across subjects Parameters @@ -464,10 +474,12 @@ def transform(self, imgs, confounds=None): """ if not isinstance(imgs, (list, tuple)): imgs = [imgs] - raw = isinstance(imgs[0], np.ndarray) - - shelving = self.masker_._shelving + raw = raw or isinstance(imgs[0], np.ndarray) + if hasattr(self, "masker_"): + shelving = self.masker_._shelving + else: + shelving = False codes = [] if raw: data_list = imgs @@ -475,6 +487,11 @@ def transform(self, imgs, confounds=None): if confounds is None: confounds = itertools.repeat(None) data_list = self.masker_.transform(imgs, confounds) # shelved + + # XXX using components which are not flipped produces garbage + # predictions! + self.dict_fact_.components_ = self.components_ + for idx, data in enumerate(data_list): if raw: if isinstance(data, _basestring): diff --git a/modl/social_sparsity.py b/modl/social_sparsity.py index f93a225..3d97202 100644 --- a/modl/social_sparsity.py +++ b/modl/social_sparsity.py @@ -6,8 +6,10 @@ # License: simplified BSD from math import sqrt +from numbers import Number import numpy as np +from scipy import ndimage from nilearn.decoding.fista import _check_lipschitz_continuous from nilearn.decoding.objective_functions import (_unmask, _logistic_loss_grad, @@ -127,12 +129,11 @@ def fista(f1_grad, f2_prox, lipschitz_constant, w_size, z = w + ((t0 - 1.) 
/ t) * (w - w_old)
         w_old[:] = w
 
-    init = dict(w=w.copy(), z=z, t=t, stepsize=stepsize)
     return w, history, init
 
 
-def _neighboorhood_norm(img, side_weights=.7):
+def _neighboorhood_norms_squared(img, side_weights=.7):
     " Return the squared norm averaged on 3D 6-neighboorhood"
 
     # Our stride tricks only work on C-contiguous arrays
@@ -159,56 +160,135 @@ def _neighboorhood_norm(img, side_weights=.7):
     # Now the edges (only 3% of the time is spent below for large arrays)
     # For 5 neighboors (6 faces of a cube)
     factor = (1 + 5 * side_weights)
-    grp_norms[ 0, 1:-1, 1:-1] /= factor
-    grp_norms[ -1, 1:-1, 1:-1] /= factor
-    grp_norms[1:-1, 0, 1:-1] /= factor
-    grp_norms[1:-1, -1, 1:-1] /= factor
-    grp_norms[1:-1, 1:-1, 0] /= factor
-    grp_norms[1:-1, 1:-1, -1] /= factor
+    grp_norms[0, 1:-1, 1:-1] /= factor
+    grp_norms[-1, 1:-1, 1:-1] /= factor
+    grp_norms[1:-1, 0, 1:-1] /= factor
+    grp_norms[1:-1, -1, 1:-1] /= factor
+    grp_norms[1:-1, 1:-1, 0] /= factor
+    grp_norms[1:-1, 1:-1, -1] /= factor
+
     # For 4 neighboors (12 edges of a cube)
     factor = (1 + 4 * side_weights)
-    grp_norms[ 0, 0, 1:-1] /= factor
-    grp_norms[ 0, -1, 1:-1] /= factor
-    grp_norms[ -1, 0, 1:-1] /= factor
-    grp_norms[ -1, -1, 1:-1] /= factor
-    grp_norms[1:-1, 0, 0] /= factor
-    grp_norms[1:-1, 0, -1] /= factor
-    grp_norms[1:-1, -1, 0] /= factor
-    grp_norms[1:-1, -1, -1] /= factor
-    grp_norms[ 0, 1:-1, 0] /= factor
-    grp_norms[ 0, 1:-1, -1] /= factor
-    grp_norms[ -1, 1:-1, 0] /= factor
-    grp_norms[ -1, 1:-1, -1] /= factor
+    grp_norms[0, 0, 1:-1] /= factor
+    grp_norms[0, -1, 1:-1] /= factor
+    grp_norms[-1, 0, 1:-1] /= factor
+    grp_norms[-1, -1, 1:-1] /= factor
+    grp_norms[1:-1, 0, 0] /= factor
+    grp_norms[1:-1, 0, -1] /= factor
+    grp_norms[1:-1, -1, 0] /= factor
+    grp_norms[1:-1, -1, -1] /= factor
+    grp_norms[0, 1:-1, 0] /= factor
+    grp_norms[0, 1:-1, -1] /= factor
+    grp_norms[-1, 1:-1, 0] /= factor
+    grp_norms[-1, 1:-1, -1] /= factor
+
     # For 3 neighboors (8 vertices of a cube)
     factor = (1 + 3 * side_weights)
-    grp_norms[ 0, 0, 0] /= factor
-    grp_norms[ 0, -1, 0] /= factor
-    grp_norms[-1, 0, 0] /= factor
-    grp_norms[-1, -1, 0] /= factor
-    grp_norms[ 0, 0, -1] /= factor
-    grp_norms[ 0, -1, -1] /= factor
-    grp_norms[-1, 0, -1] /= factor
+    grp_norms[0, 0, 0] /= factor
+    grp_norms[0, -1, 0] /= factor
+    grp_norms[-1, 0, 0] /= factor
+    grp_norms[-1, -1, 0] /= factor
+    grp_norms[0, 0, -1] /= factor
+    grp_norms[0, -1, -1] /= factor
+    grp_norms[-1, 0, -1] /= factor
     grp_norms[-1, -1, -1] /= factor
 
     return grp_norms
 
 
-def _prox_social_sparsity(img, alpha, side_weights=.7):
-    """Social sparsity on 3x3x3 groups, as defined by eq 4 of Kowalski et
-    al, 'Social Sparsity...'"""
-    grp_norms = _neighboorhood_norm(img, side_weights=side_weights)
+def _prox_l21(img, alpha, grp_norms_squared=None):
     # To avoid side effects, assign the raw img values on the side
-    grp_norms = np.sqrt(grp_norms, out=grp_norms)
+    if grp_norms_squared is None:
+        grp_norms_squared = img ** 2
+    grp_norms = np.sqrt(grp_norms_squared, out=grp_norms_squared)
     shrink = np.zeros(img.shape)
     img_nz = (grp_norms > 1e-10)
     shrink[img_nz] = (1 - alpha / grp_norms[img_nz]).clip(0)
     return img * shrink
+
+
+def _prox_social_sparsity(img, alpha, side_weights=.7, use_scipy=False,
+                          radius=1):
+    """Social sparsity on cubic (2 * radius + 1) ** 3 neighborhoods, as
+    defined by eq 4 of Kowalski et al, 'Social Sparsity...'
+
+    Parameters
+    ----------
+    side_weights: nonnegative float, optional (default 0.7)
+        Weights of the sides of the neighborhood relative to its center. 
A value of 1
+        corresponds to the classical overlapping group-Lasso shrinkage
+        operator.
+
+    radius: int, optional (default 1)
+        Radius of the neighborhood. This is a field-of-view (FOV)
+        parameter that plays a role similar to the FWHM of a Gaussian
+        kernel. The larger the radius, the smoother the prox.
+
+    use_scipy: boolean, optional (default False)
+        If True, use scipy's ndimage.filters.convolve for the smoothing,
+        with an appropriate rectangular local kernel.
+
+    Notes
+    -----
+    For the moment, only the use_scipy=True option supports the radius
+    parameter.
+    """
+    if use_scipy:
+        if isinstance(radius, Number):
+            radius = (radius,) * img.ndim
+        radius = np.asanyarray(radius)
+        diameter = 2 * radius + 1
+        grp_norms_squared = img ** 2
+        if side_weights == 1.:
+            # use scipy's optimized rectangular uniform filter
+            ndimage.uniform_filter(grp_norms_squared, size=diameter,
+                                   output=grp_norms_squared,
+                                   mode="constant")
+        else:
+            social_filter = np.full(diameter, 1.)
+            social_filter *= side_weights
+
+            # adjust weight at center of filter
+            if img.ndim == 1:
+                social_filter[radius] = 1.
+            elif img.ndim == 2:
+                social_filter[radius, radius] = 1.
+            elif img.ndim == 3:
+                social_filter[radius, radius, radius] = 1.
+            else:
+                raise RuntimeError("Unsupported img.ndim: %i." % img.ndim)
+
+            # normalize filter weights to sum to 1
+            social_filter /= social_filter.sum()
+
+            # the actual convolution
+            ndimage.filters.convolve(grp_norms_squared, social_filter,
+                                     output=grp_norms_squared, mode="constant")
+    else:
+        # fall back to the stride-trick implementation above (due to @gael)
+        grp_norms_squared = _neighboorhood_norms_squared(
+            img, side_weights=side_weights)
+    return _prox_l21(img, alpha, grp_norms_squared=grp_norms_squared)
+
+
+def _social_sparsity_alpha_grid(XTy, mask, side_weights=.7, n_alphas=10,
+                                eps=1e-3):
+    imgs = np.zeros((len(XTy),) + mask.shape, dtype=XTy.dtype)
+    imgs[:, mask] = XTy
+    grp_norms_squared = [_neighboorhood_norms_squared(
+        img, side_weights=side_weights) for img in imgs]
+    alpha_max = np.sqrt(np.max(grp_norms_squared))
+
+    if n_alphas == 1:
+        return np.array([alpha_max])
+    alpha_min = alpha_max * eps
+    return np.logspace(np.log10(alpha_min), np.log10(alpha_max),
+                       num=n_alphas)[::-1]
+
+
 def social_solver(X, y, alpha, mask, loss=None, max_iter=100,
-                  lipschitz_constant=None, init=None,
-                  tol=1e-4, callback=None, verbose=1):
+                  lipschitz_constant=None, init=None, tol=1e-4,
+                  callback=None, verbose=1, **kwargs):
     """Solver for social-sparsity models.
Can handle least squares (mean squared error --a.k.a mse) or logistic
@@ -288,13 +368,13 @@ def f1_grad(w):
     if loss == "mse":
         def f2_prox(w, stepsize):
             out = _prox_social_sparsity(_unmask(w, mask),
-                                        alpha * stepsize)
+                                        alpha * stepsize, **kwargs)
             return out[mask]
     else:
         # Deal with intercept
         def f2_prox(w, stepsize):
             out = _prox_social_sparsity(_unmask(w[:-1], mask),
-                                        alpha * stepsize)
+                                        alpha * stepsize, **kwargs)
             return np.append(out[mask], w[-1])
 
     # invoke FISTA solver
@@ -306,12 +386,10 @@ def f2_prox(w, stepsize):
     return w, obj, init
 
 
-def social_solver_with_l1(X, y, alpha, l1_ratio, mask,
-                          loss=None, max_iter=100,
-                          lipschitz_constant=None, init=None,
-                          tol=1e-4, callback=None, verbose=1):
+def social_solver_with_l1(X, y, alpha, l1_ratio, mask, loss=None, max_iter=100,
+                          lipschitz_constant=None, init=None, tol=1e-4,
+                          callback=None, verbose=1, **kwargs):
     # Hack to plug social-sparsity in SpaceNet
-    return social_solver(X, y, alpha, mask,
-                         loss=loss, max_iter=max_iter,
-                         lipschitz_constant=lipschitz_constant, init=init,
-                         tol=tol, callback=callback, verbose=verbose)
+    return social_solver(X, y, alpha, mask, loss=loss, max_iter=max_iter,
+                         lipschitz_constant=lipschitz_constant, init=init,
+                         tol=tol, callback=callback, verbose=verbose, **kwargs)
diff --git a/modl/structured_dict_update.py b/modl/structured_dict_update.py
index 27c549f..7180275 100644
--- a/modl/structured_dict_update.py
+++ b/modl/structured_dict_update.py
@@ -94,6 +94,8 @@ def _unmask(v):
     weight = alpha
     weight /= a_kk
     weight *= counter
+    if counter < 1.:
+        raise ValueError("counter should be >= 1, got %s" % counter)
    if which == "classical":
         from .utils.math.enet import enet_projection
         atom_temp = np.zeros_like(atom)
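
The snippet below is a minimal, self-contained sketch of what the use_scipy=True branch of _prox_social_sparsity computes (eq. 4 of Kowalski et al.): average the squared coefficients over a small weighted cubic neighborhood, then soft-threshold each voxel by the square root of that average. Only numpy and scipy are assumed; the helper name social_shrinkage is illustrative and is not part of the modl API.

import numpy as np
from scipy import ndimage


def social_shrinkage(img, alpha, side_weights=0.7, radius=1):
    """Soft-threshold each voxel by the weighted RMS norm of its
    cubic neighborhood (eq. 4 of Kowalski et al.)."""
    # Cubic kernel of side 2 * radius + 1, sides down-weighted relative
    # to the center, normalized to sum to one.
    shape = (2 * radius + 1,) * img.ndim
    kernel = np.full(shape, side_weights)
    kernel[(radius,) * img.ndim] = 1.
    kernel /= kernel.sum()

    # Neighborhood-averaged squared coefficients, then their square root.
    grp_norms = np.sqrt(ndimage.convolve(img ** 2, kernel, mode="constant"))

    # Group soft-thresholding: scale each coefficient by
    # max(0, 1 - alpha / neighborhood_norm).
    shrink = np.zeros_like(img)
    nonzero = grp_norms > 1e-10
    shrink[nonzero] = (1. - alpha / grp_norms[nonzero]).clip(0.)
    return img * shrink


if __name__ == "__main__":
    rng = np.random.RandomState(0)
    img = rng.randn(16, 16, 16)
    shrunk = social_shrinkage(img, alpha=0.5)
    print("fraction of zeroed voxels:", np.mean(shrunk == 0.))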
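
_social_sparsity_alpha_grid follows the usual regularization-path recipe: alpha_max is the largest neighborhood norm of the unpenalized correlations XTy (any larger alpha shrinks every voxel to zero), and the grid is log-spaced down to eps * alpha_max, largest first. The standalone sketch below reproduces that recipe with the same kernel construction as the previous snippet; social_alpha_grid is an illustrative name, not part of the modl API.

import numpy as np
from scipy import ndimage


def social_alpha_grid(XTy, mask, side_weights=0.7, radius=1,
                      n_alphas=10, eps=1e-3):
    """Log-spaced alpha grid, largest first, for the social-sparsity prox."""
    # Unmask each map into the full 3D volume.
    imgs = np.zeros((len(XTy),) + mask.shape, dtype=XTy.dtype)
    imgs[:, mask] = XTy

    # Same weighted neighborhood-averaging kernel as in the prox sketch.
    shape = (2 * radius + 1,) * mask.ndim
    kernel = np.full(shape, side_weights)
    kernel[(radius,) * mask.ndim] = 1.
    kernel /= kernel.sum()

    # alpha_max is the largest neighborhood norm: at that value the
    # shrinkage factor 1 - alpha / norm is <= 0 everywhere, hence w = 0.
    norms_squared = [ndimage.convolve(img ** 2, kernel, mode="constant")
                     for img in imgs]
    alpha_max = np.sqrt(np.max(norms_squared))
    if n_alphas == 1:
        return np.array([alpha_max])
    return np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max),
                       num=n_alphas)[::-1]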
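
With this branch installed, the solver itself can be exercised on a small synthetic problem roughly as follows. This is a sketch, not a test: it assumes social_solver estimates the Lipschitz constant itself when lipschitz_constant is None (as nilearn's other solvers do), and that the extra keyword arguments are forwarded to _prox_social_sparsity as in the hunk above.

import numpy as np
from modl.social_sparsity import social_solver

rng = np.random.RandomState(0)
mask = np.ones((8, 8, 8), dtype=bool)
n_samples, n_voxels = 50, int(mask.sum())

# Synthetic regression problem on masked 3D data.
X = rng.randn(n_samples, n_voxels)
w_true = np.zeros(n_voxels)
w_true[:20] = 1.
y = X.dot(w_true) + 0.1 * rng.randn(n_samples)

# side_weights / use_scipy / radius are forwarded to _prox_social_sparsity
# through **kwargs.
w, objective, init = social_solver(X, y, alpha=0.1, mask=mask, loss="mse",
                                   max_iter=50, verbose=0,
                                   side_weights=0.7, use_scipy=True, radius=1)
print("non-zero coefficients:", np.sum(w != 0))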