From c4b123d1055a1916fcfa87e8b1ca1f128cad2704 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 14:59:52 +0000 Subject: [PATCH 01/21] conftest: minor fixture changes --- tests/conftest.py | 53 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a35954d7..ec872834 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from pathlib import Path import os +from typing import List import pandas as pd import pytest @@ -9,7 +10,7 @@ import navis -@pytest.fixture +@pytest.fixture(scope="session") def data_dir(): return Path(__file__).resolve().parent.parent / "navis" / "data" @@ -17,8 +18,8 @@ def data_dir(): @pytest.fixture( params=["Path", "pathstr", "swcstr", "textbuffer", "rawbuffer", "DataFrame"] ) -def swc_source(request, data_dir: Path): - swc_path: Path = data_dir / "swc" / "722817260.swc" +def swc_source(request, swc_paths: List[Path]): + swc_path: Path = swc_paths[0] if request.param == "Path": yield swc_path elif request.param == "pathstr": @@ -42,9 +43,9 @@ def swc_source(request, data_dir: Path): @pytest.fixture( params=["dirstr", "dirpath", "list", "listwithdir"], ) -def swc_source_multi(request, data_dir: Path): - dpath = data_dir / "swc" - fpath = dpath / "722817260.swc" +def swc_source_multi(request, swc_paths: List[Path]): + fpath = swc_paths[0] + dpath = fpath.parent if request.param == "dirstr": yield str(dpath) elif request.param == "dirpath": @@ -74,3 +75,43 @@ def voxel_nrrd_path(tmp_path): nrrd.write(os.fspath(path), data, header) return path + + +def data_paths(dpath, glob="*"): + return sorted(dpath.glob(glob)) + + +@pytest.fixture(scope="session") +def swc_paths(data_dir: Path): + return data_paths(data_dir / "swc", "*.swc") + + +@pytest.fixture(scope="session") +def gml_paths(data_dir: Path): + return data_paths(data_dir / "gml", "*.gml") + + +@pytest.fixture(scope="session") +def obj_paths(data_dir: Path): + return data_paths(data_dir / "obj", "*.obj") + + +@pytest.fixture(scope="session") +def synapses_paths(data_dir: Path): + return data_paths(data_dir / "synapses", "*.csv") + + +@pytest.fixture(scope="session") +def volumes_paths(data_dir: Path): + return data_paths(data_dir / "volumes", "*.obj") + + +@pytest.fixture +def treeneuron_dfs(swc_paths, synapses_paths): + swc_reader = navis.io.swc_io.SwcReader() + out = [] + for swc_path, syn_path in zip(swc_paths, synapses_paths): + neuron = swc_reader.read_file_path(swc_path) + neuron.connectors = pd.read_csv(syn_path) + out.append(neuron) + return out From 30abb72c96f97d69972e81d9ebfe65b3c7205d7b Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 15:00:15 +0000 Subject: [PATCH 02/21] Dotprops.__len__ --- navis/core/dotprop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/navis/core/dotprop.py b/navis/core/dotprop.py index c83b04f9..c4e5c64e 100644 --- a/navis/core/dotprop.py +++ b/navis/core/dotprop.py @@ -622,3 +622,6 @@ def to_skeleton(self, tn._soma = self._soma return tn + + def __len__(self): + return len(self.points) From 35281046df2360fb0765c39c0bf07d52a8463ab2 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 15:00:37 +0000 Subject: [PATCH 03/21] Initial implementation of smat builder --- navis/nbl/smat.py | 539 ++++++++++++++++++++++++++++++++++++ tests/test_nbl/__init__.py | 0 tests/test_nbl/test_smat.py | 161 +++++++++++ 3 files changed, 700 insertions(+) create mode 100644 navis/nbl/smat.py create mode 100644 tests/test_nbl/__init__.py 
create mode 100644 tests/test_nbl/test_smat.py diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py new file mode 100644 index 00000000..01dc6f2a --- /dev/null +++ b/navis/nbl/smat.py @@ -0,0 +1,539 @@ +from __future__ import annotations +from itertools import permutations +import sys +import os +from collections import Counter +from concurrent.futures import ProcessPoolExecutor +from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +import logging +from pathlib import Path +from functools import lru_cache +from copy import deepcopy +import operator + +import numpy as np +import pandas as pd + +from ..core.neurons import Dotprops + +logger = logging.getLogger(__name__) + +DEFAULT_SEED = 1991 + +epsilon = sys.float_info.epsilon +cpu_count = max(1, (os.cpu_count() or 2) - 1) +IMPLICIT_INTERVAL = "[)" + +fp = Path(__file__).resolve().parent +smat_path = fp / 'score_mats' + + +def chunksize(it_len, cpu_count, min_chunk=50): + return max(min_chunk, int(it_len / (cpu_count * 4))) + + +def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): + """Ensures that boundaries cover -inf to inf. + If the left or rightmost values lie within the interval ``(left, right)``, + ``-inf`` or ``inf`` will be prepended or appended respectively. + Otherwise, the left and right values will be set to + ``-inf`` and ``inf`` respectively. + Parameters + ---------- + arr : array-like of floats + Array of boundaries + left : float, optional + If the first value is greater than this, prepend -inf; + otherwise replace that value with -inf. + Defaults to -inf if None. + right : float, optional + If the last value is less than this, append inf; + otherwise replace that value with inf. + Defaults to inf if None. + Returns + ------- + np.ndarray of floats + Boundaries from -inf to inf + Raises + ------ + ValueError + If values of ``arr`` are not monotonically increasing. + """ + if left is None: + left = -np.inf + if right is None: + right = np.inf + + if not np.all(np.diff(arr) > 0): + raise ValueError("Boundaries are not monotonically increasing") + + items = list(arr) + if items[0] <= left: + items[0] = -np.inf + else: + items.insert(0, -np.inf) + + if items[-1] >= right: + items[-1] = np.inf + else: + items.append(np.inf) + + return np.array(items) + + +def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any]]: + for a, b in pairs: + if a != b: + yield a, b + + +class LookupNdBuilder: + def __init__( + self, + dotprops, + matching_sets, + boundaries: Sequence[Sequence[float]], + match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], + nonmatching=None, + seed=DEFAULT_SEED, + ) -> None: + f"""Class for building an N-dimensional score lookup for NBLAST. + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + boundaries : sequence of array-likes of floats + List of lists, where the inner lists are monotonically increasing + from -inf to inf. + The length of the outer list is the dimensionality of the lookup table. + The inner lists are the boundaries of bins for that dimension, + i.e. an inner list of N items describes N-1 bins. + If an inner list is not inf-bounded, + -inf and inf will be prepended and appended. + See the ``wrap_bounds`` convenience function. 
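+            For example, ``[[0, 1, 2], [0.5]]`` would be wrapped to
+            ``[[-inf, 0, 1, 2, inf], [-inf, 0.5, inf]]``,
+            describing a 4x2 grid of bins.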
+ match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] + Function taking 2 arguments, + both instances of ``navis.core.neurons.Dotprops``, + and returning a list of 1D ``numpy.ndarray``s of floats. + The length of the list must be the same as the length of ``boundaries``. + The length of the ``array``s must be the same + as the number of points in the first argument. + This function returns values describing the quality of + point matches from a query to a target neuron. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used + (on the assumption that matches are a small subset of possible pairs). + seed : int, optional + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} + """ + self.dotprops = dotprops + self.matching_sets = matching_sets + self._nonmatching = nonmatching + self.match_fn = match_fn + self.boundaries = [wrap_bounds(b) for b in boundaries] + + self.rng = np.random.default_rng(seed) + + def _dotprop_keys(self): + try: + return self.dotprops.keys() + except AttributeError: + return range(len(self.dotprops)) + + @property + def nonmatching(self): + """Indices of nonmatching set of neurons""" + if self._nonmatching is None: + return set(self._dotprop_keys()) + return self._nonmatching + + def _yield_matching_pairs(self): + for ms in self.matching_sets: + yield from yield_not_same(permutations(ms, 2)) + + def _yield_nonmatching_pairs(self): + # todo: this could be much better, use meshgrid or shuffle index arrays + return yield_not_same(permutations(self.nonmatching, 2)) + + def _empty_counts(self): + shape = [len(b) - 1 for b in self.boundaries] + return np.zeros(shape, int) + + def _query_to_idxs(self, q_idx, t_idx, counts=None): + response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) + idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] + + if counts is None: + counts = self._empty_counts() + + for idx in zip(*idxs): + counts[idx] += 1 + + return counts + + def _counts_array(self, idx_pairs, threads=None): + counts = self._empty_counts() + if threads is None or threads == 0 and cpu_count == 1: + for q_idx, t_idx in idx_pairs: + counts = self._query_to_idxs(q_idx, t_idx, counts) + return counts + + threads = threads or cpu_count + idx_pairs = np.asarray(idx_pairs, dtype=int) + chunks = chunksize(len(idx_pairs), threads) + + with ProcessPoolExecutor(threads) as exe: + for these_counts in exe.map( + self._query_to_idxs, + idx_pairs[:, 0], + idx_pairs[:, 1], + chunksize=chunks, + ): + counts += these_counts + + return counts + + def _pick_nonmatching_pairs(self, n_matching_qual_vals): + # pre-calculating which pairs we're going to use, + # rather than drawing them as we need them, + # means that we can parallelise the later step more effectively. 
+ # Slowdowns here are practically meaningless + # because of how long distdot calculation will take + all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) + nonmatching_pairs = [] + n_nonmatching_qual_vals = 0 + while n_nonmatching_qual_vals < n_matching_qual_vals: + idx = self.rng.integers(0, len(all_nonmatching_pairs)) + nonmatching_pair = all_nonmatching_pairs.pop(idx) + nonmatching_pairs.append(nonmatching_pair) + n_nonmatching_qual_vals += len(self.dotprops[nonmatching_pair[0]]) + + return nonmatching_pairs + + def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: + matching_pairs = list(set(self._yield_matching_pairs())) + # need to know the eventual distdot count + # so we know how many non-matching pairs to draw + q_idx_count = Counter(p[0] for p in matching_pairs) + n_matching_qual_vals = sum( + len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() + ) + + nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals) + + match_counts = self._counts_array(matching_pairs, threads) + nonmatch_counts = self._counts_array(nonmatching_pairs, threads) + + # account for there being different total numbers of match/nonmatch dist dots + matching_factor = nonmatch_counts.sum() / match_counts.sum() + if np.any(match_counts + nonmatch_counts == 0): + logger.warning("Some lookup cells have no data in them") + + cells = np.log2( + (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + ) + + return self.boundaries, cells + + def build(self, threads=None) -> LookupNd: + """Build the score matrix. + All non-identical neuron pairs within all matching sets are selected, + and distdots calculated for those pairs. + Then, the minimum number of non-matching pairs are randomly drawn + so that at least as many distdots can be calculated for non-matching + pairs. + In each bin of the score matrix, the log2 odds ratio of a distdot + in that bin belonging to a match vs. non-match is calculated. + Parameters + ---------- + threads : int, optional + If None, act in serial. + If 0, use cpu_count - 1. + Otherwise, use the given value. + Returns + ------- + pd.DataFrame + Suitable for passing to navis.nbl.ScoringFunction + Raises + ------ + ValueError + If dist_bins or dot_bins are not set. + """ + return LookupNd(*self._build(threads)) + + +def dist_dot(q: Dotprops, t: Dotprops): + return list(q.dist_dots(t)) + + +def dist_dot_alpha(q: Dotprops, t: Dotprops): + dist, dot, alpha = q.dist_dots(t, alpha=True) + return [dist, dot * np.sqrt(alpha)] + + +class LookupDistDotBuilder(LookupNdBuilder): + def __init__( + self, + dotprops, + matching_sets, + dist_boundaries, + dot_boundaries, + nonmatching=None, + use_alpha=False, + seed=DEFAULT_SEED, + ): + f"""Class for building a 2-dimensional score lookup for NBLAST. + The scores are + 1. The distances between best-matching points + 2. The dot products of direction vectors around those points, + optionally scaled by the colinearity ``alpha``. + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + dist_boundaries : array-like of floats + Monotonically increasing boundaries between distance bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. 
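+            (e.g. ``[0, 10, 100]`` would become ``[-inf, 10, 100, inf]``,
+            while ``[1, 10, 100]`` would become ``[-inf, 1, 10, 100, inf]``).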
+ inf will be appended unless the last value is already inf. + Consider using ``numpy.logspace`` or ``numpy.geomspace`` + to generate the inner boundaries of the bins. + dot_boundaries : array-like of floats + Monotonically increasing boundaries between + (possibly alpha-scaled) dot product bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. + If the last value < 1, inf will be appended, + otherwise the last value will be replaced with inf. + Consider using ``numpy.linspace`` between 0 and 1. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used + (on the assumption that matches are a small subset of possible pairs). + use_alpha : bool, optional + If true, multiply the dot product by the geometric mean + of the matched points' alpha values + (i.e. ``sqrt(alpha1 * alpha2)``). + seed : int, optional + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} + """ + match_fn = dist_dot_alpha if use_alpha else dist_dot + super().__init__( + dotprops, + matching_sets, + [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + match_fn, + nonmatching, + seed, + ) + + def build(self, threads=None) -> Lookup2d: + return Lookup2d(*self._build(threads)) + + +class LookupNd: + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if [len(b) - 1 for b in boundaries] != list(cells.shape): + raise ValueError("boundaries and cells have inconsistent bin counts") + self.boundaries = boundaries + self.cells = cells + + def __call__(self, *args): + if len(args) != len(self.boundaries): + raise TypeError( + f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" + ) + + idxs = tuple( + np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries) + ) + out = self.cells[idxs] + return out + + +def format_boundaries(arr): + return [f"[{lower},{upper})" for lower, upper in zip(arr[:-1], arr[1:])] + + +def parse_boundary(item, strict=False): + explicit_interval = item[0] + item[-1] + if strict and item[0] + item[-1] != IMPLICIT_INTERVAL: + raise ValueError( + f"Enclosing characters {explicit_interval} " + f"do not match implicit interval specifiers {IMPLICIT_INTERVAL}" + ) + return tuple(float(i) for i in item[1:-1].split(",")) + + +def parse_boundaries(items, strict=False): + # declaring upper first and then checking for None + # pleases the type checker, and handles the case where + # len(items) == 0 + upper = None + for item in items: + lower, upper = parse_boundary(item, strict) + yield lower + if upper is None: + return + yield upper + + +class Lookup2d(LookupNd): + """Convenience class inheriting from LookupNd for the common 2D case. + Provides IO with pandas DataFrames. + """ + + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if len(boundaries) != 2: + raise ValueError("boundaries must be of length 2; cells must be 2D") + super().__init__(boundaries, cells) + + def to_dataframe(self) -> pd.DataFrame: + # numpy.digitize includes left, excludes right, i.e. "[left,right)" + return pd.DataFrame( + self.cells, + format_boundaries(self.boundaries[0]), + format_boundaries(self.boundaries[1]), + ) + + @classmethod + def from_dataframe(cls, df: pd.DataFrame, strict=False): + f"""Parse score matrix from a dataframe with string index and column labels. 
+ Expects the index and column labels to specify an interval + like ``f"[{{lower}},{{upper}})"``. + Will replace the lowermost and uppermost bound with -inf and inf + if they are not already. + Parameters + ---------- + strict : bool, optional + If falsey (default), ignores parentheses and brackets, + effectively normalising to + ``{IMPLICIT_INTERVAL[0]}lower,upper{IMPLICIT_INTERVAL[1]}`` + as an implementation detail. + Otherwise, raises a ValueError if parens/brackets + do not match the implementation. + """ + boundaries = [] + for arr in (df.index, df.columns): + b = np.array(list(parse_boundaries(arr, strict))) + b[0] = -np.inf + b[-1] = np.inf + boundaries.append(b) + + return cls(boundaries, df.to_numpy()) + + +@lru_cache(maxsize=None) +def _smat_fcwb(alpha=False): + # cached private function defers construction + # until needed (speeding startup), + # but avoids repeated reads (speeding later uses) + fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha] + fpath = smat_path / fname + + return Lookup2d.from_dataframe( + pd.read_csv(fpath, index_col=0) + ) + + +def smat_fcwb(alpha=False): + # deepcopied so that mutations do not propagate to cache + return deepcopy(_smat_fcwb(alpha)) + + +def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): + """Checks functionally that the callable can be used as a score function. + Parameters + ---------- + nargs : optional int, default 2 + How many positional arguments the score function should have. + scalar : optional bool, default True + Check that the function can be used on ``nargs`` scalars. + array : optional bool, default True + Check that the function can be used on ``nargs`` 1D ``numpy.ndarray``s. + Raises + ------ + ValueError + If the score function is not appropriate. + """ + if scalar: + scalars = [0.5] * nargs + if not isinstance(fn(*scalars), float): + raise ValueError("smat does not take 2 floats and return a float") + + if array: + test_arr = np.array([0.5] * 3) + arrs = [test_arr] * nargs + try: + out = fn(*arrs) + except Exception as e: + raise ValueError(f"Failed to use smat with numpy arrays: {e}") + + if out.shape != test_arr.shape: + raise ValueError( + f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}" + ) + + +SCORE_FN_DESCR = """ +NBLAST score functions take 2 floats or numpy arrays of floats of length N +(for matched dotprop points/tangents, distance and dot product; +the latter possibly scaled by the geometric mean of the alpha colinearity values) +and returns a float or N-length numpy array of floats. +""".strip().replace("\n", " ") + + +def parse_score_fn(smat, alpha=False): + f"""Interpret ``smat`` as a score function. + Primarily for backwards compatibility. + {SCORE_FN_DESCR} + Parameters + ---------- + smat : None | "auto" | str | os.PathLike | pandas.DataFrame | Callable[[float, float], float] + If ``None``, use ``operator.mul``. + If ``"auto"``, use ``navis.nbl.smat.smat_fcwb(alpha)``. + If a dataframe, use ``navis.nbl.smat.Lookup2d.from_dataframe(smat)``. + If another string or path-like, load from CSV in a dataframe and uses as above. + Also checks the signature of the callable. + Raises an error, probably a ValueError, if it can't be interpreted. + alpha : optional bool, default False + If ``smat`` is None, choose whether to use the FCWB matrices + with or without alpha. + Returns + ------- + Callable + Raises + ------ + ValueError + If score function cannot be interpreted. 
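+    Examples
+    --------
+    The simplest case is the ``None`` fallback to plain multiplication:
+
+    >>> fn = parse_score_fn(None)
+    >>> fn(0.5, 0.5)
+    0.25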
+ """ + if smat is None: + smat = operator.mul + elif smat == 'auto': + smat = smat_fcwb(alpha) + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + check_score_fn(smat) + + return smat diff --git a/tests/test_nbl/__init__.py b/tests/test_nbl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py new file mode 100644 index 00000000..93959da2 --- /dev/null +++ b/tests/test_nbl/test_smat.py @@ -0,0 +1,161 @@ +from navis import Dotprops +import pytest + +import numpy as np + +from navis.nbl.smat import ( + wrap_bounds, LookupNd, Lookup2d, LookupDistDotBuilder +) + + +SMALLEST_DIM_SIZE = 3 +SEED = 1991 + + +@pytest.mark.parametrize( + ["arr", "left", "right", "expected"], + [ + ([1, 2], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([-np.inf, 1, 2, np.inf], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([0, 1, 2, 3], 0, 3, [-np.inf, 1, 2, np.inf]), + ], +) +def test_wrap_bounds(arr, left, right, expected): + assert np.allclose(wrap_bounds(arr, left, right), expected) + + +def test_wrap_bounds_error(): + with pytest.raises(ValueError): + wrap_bounds([1, 2, 1]) + + +def lookup_args(ndim): + f""" + Create arguments for an ND lookup table. + The first dimension is of size {SMALLEST_DIM_SIZE}, + and subsequent dimensions are 1 longer than the previous. + The data in the cells are the sequence from 0 + to the size of the array. + The boundaries are 0 to the length of the dimension, + with the left and rightmost values replaced with -inf and inf respectively. + Examples + -------- + >>> lookup_args(2) + ( + [ + array([-inf, 1, 2, inf]), + array([-inf, 1, 2, 3, inf]), + ], + array([ + [ 0, 1, 2, 3 ], + [ 4, 5, 6, 7 ], + [ 8, 9, 10, 11 ], + ]), + ) + """ + shape = tuple(range(SMALLEST_DIM_SIZE, SMALLEST_DIM_SIZE + ndim)) + cells = np.arange(np.product(shape)).reshape(shape) + boundaries = [] + for s in shape: + b = np.arange(s + 1, dtype=float) + b[0] = -np.inf + b[-1] = np.inf + boundaries.append(b) + return boundaries, cells + + +def fmt_array(arg): + if np.isscalar(arg): + return str(arg) + else: + return "[" + ",".join(fmt_array(v) for v in arg) + "]" + + +@pytest.mark.parametrize( + ["ndim"], [[1], [2], [3], [4], [5]], ids=lambda x: f"{x}D" +) +@pytest.mark.parametrize( + ["arg"], + [ + (-1000,), + (0,), + (1,), + (1.5,), + (2,), + (1000,), + ([-1000, 0, 1, 1.5, 2, 1000],), + ], + ids=fmt_array, +) +def test_lookupNd(ndim, arg): + lookup = LookupNd(*lookup_args(ndim)) + + args = [arg for _ in range(ndim)] + expected_arr_idx = np.floor([ + np.clip(arg, 0, dim + SMALLEST_DIM_SIZE - 1) for dim in range(ndim) + ]).astype(int) + expected_val = np.ravel_multi_index( + tuple(expected_arr_idx), lookup.cells.shape + ) + + response = lookup(*args) + assert np.all(response == expected_val) + + +@pytest.mark.parametrize(["strict"], [(False,), (True,)]) +def test_lookup2d_roundtrip(strict): + lookup = Lookup2d(*lookup_args(2)) + df = lookup.to_dataframe() + lookup2 = Lookup2d.from_dataframe(df, strict=strict) + assert np.allclose(lookup.cells, lookup2.cells) + for b1, b2 in zip(lookup.boundaries, lookup2.boundaries): + assert np.allclose(b1, b2) + + +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): + k = 5 + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] + n_orig = len(dotprops) + + 
# make jittered copies of these neurons + rng = np.random.default_rng(SEED) + jitter_sigma = 50 + matching_sets = [] + for idx, dp in enumerate(dotprops[:]): + dotprops.append( + Dotprops( + dp.points + rng.normal(0, jitter_sigma, dp.points.shape), k + ) + ) + # assign each neuron its jittered self as a match + matching_sets.append({idx, idx + n_orig}) + + # original neurons should all not match each other + nonmatching = list(range(n_orig)) + + # max distance between any 2 points in the data + # for calculating dist boundaries + max_dist = np.linalg.norm( + np.ptp( + np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, + ) + ) + + return LookupDistDotBuilder( + dotprops, + matching_sets, + np.geomspace(10, max_dist, 5)[:-1], + np.linspace(0, 1, 5), + nonmatching, + alpha, + seed=SEED + 1, + ) + + +@pytest.mark.parametrize(["alpha"], [(True,), (False,)]) +@pytest.mark.parametrize(["threads"], [(0,), (2,), (None,)]) +def test_lookupdistdotbuilder_builds(treeneuron_dfs, threads, alpha): + builder = prepare_lookupdistdotbuilder(treeneuron_dfs, alpha) + lookup = builder.build(threads) + # `pytest -rP` to see output + print(lookup.to_dataframe()) From 989c648798b9eca871e9c9bd912224ef6c1c99b1 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 18:40:21 +0000 Subject: [PATCH 04/21] Fix smat tests --- tests/test_nbl/test_smat.py | 45 ++++++++++--------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 93959da2..e1d821fd 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -4,7 +4,7 @@ import numpy as np from navis.nbl.smat import ( - wrap_bounds, LookupNd, Lookup2d, LookupDistDotBuilder + Digitizer, LookupNd, Lookup2d, LookupDistDotBuilder ) @@ -12,23 +12,6 @@ SEED = 1991 -@pytest.mark.parametrize( - ["arr", "left", "right", "expected"], - [ - ([1, 2], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), - ([-np.inf, 1, 2, np.inf], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), - ([0, 1, 2, 3], 0, 3, [-np.inf, 1, 2, np.inf]), - ], -) -def test_wrap_bounds(arr, left, right, expected): - assert np.allclose(wrap_bounds(arr, left, right), expected) - - -def test_wrap_bounds_error(): - with pytest.raises(ValueError): - wrap_bounds([1, 2, 1]) - - def lookup_args(ndim): f""" Create arguments for an ND lookup table. @@ -38,6 +21,7 @@ def lookup_args(ndim): to the size of the array. The boundaries are 0 to the length of the dimension, with the left and rightmost values replaced with -inf and inf respectively. 
+ Examples -------- >>> lookup_args(2) @@ -55,13 +39,8 @@ def lookup_args(ndim): """ shape = tuple(range(SMALLEST_DIM_SIZE, SMALLEST_DIM_SIZE + ndim)) cells = np.arange(np.product(shape)).reshape(shape) - boundaries = [] - for s in shape: - b = np.arange(s + 1, dtype=float) - b[0] = -np.inf - b[-1] = np.inf - boundaries.append(b) - return boundaries, cells + digitizers = [Digitizer(np.arange(s + 1, dtype=float)) for s in shape] + return digitizers, cells def fmt_array(arg): @@ -102,14 +81,14 @@ def test_lookupNd(ndim, arg): assert np.all(response == expected_val) -@pytest.mark.parametrize(["strict"], [(False,), (True,)]) -def test_lookup2d_roundtrip(strict): - lookup = Lookup2d(*lookup_args(2)) +def test_lookup2d_roundtrip(): + digs, cells = lookup_args(2) + lookup = Lookup2d(*digs, cells=cells) df = lookup.to_dataframe() - lookup2 = Lookup2d.from_dataframe(df, strict=strict) + lookup2 = Lookup2d.from_dataframe(df) assert np.allclose(lookup.cells, lookup2.cells) - for b1, b2 in zip(lookup.boundaries, lookup2.boundaries): - assert np.allclose(b1, b2) + for b1, b2 in zip(lookup.digitizers, lookup2.digitizers): + assert b1 == b2 def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): @@ -144,8 +123,8 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): return LookupDistDotBuilder( dotprops, matching_sets, - np.geomspace(10, max_dist, 5)[:-1], - np.linspace(0, 1, 5), + Digitizer.from_geom(10, max_dist, 5), + Digitizer.from_linear(0, 1, 5), nonmatching, alpha, seed=SEED + 1, From f7d0ae181a2bd03256d7b8e230c323ad64cd0721 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 18:40:29 +0000 Subject: [PATCH 05/21] Significant refactor of smat.py --- navis/nbl/smat.py | 372 +++++++++++++++++++++++++++++----------------- 1 file changed, 234 insertions(+), 138 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 01dc6f2a..ca829150 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -4,12 +4,13 @@ import os from collections import Counter from concurrent.futures import ProcessPoolExecutor -from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple, Union import logging from pathlib import Path from functools import lru_cache from copy import deepcopy import operator +import math import numpy as np import pandas as pd @@ -25,62 +26,13 @@ IMPLICIT_INTERVAL = "[)" fp = Path(__file__).resolve().parent -smat_path = fp / 'score_mats' +smat_path = fp / "score_mats" def chunksize(it_len, cpu_count, min_chunk=50): return max(min_chunk, int(it_len / (cpu_count * 4))) -def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): - """Ensures that boundaries cover -inf to inf. - If the left or rightmost values lie within the interval ``(left, right)``, - ``-inf`` or ``inf`` will be prepended or appended respectively. - Otherwise, the left and right values will be set to - ``-inf`` and ``inf`` respectively. - Parameters - ---------- - arr : array-like of floats - Array of boundaries - left : float, optional - If the first value is greater than this, prepend -inf; - otherwise replace that value with -inf. - Defaults to -inf if None. - right : float, optional - If the last value is less than this, append inf; - otherwise replace that value with inf. - Defaults to inf if None. - Returns - ------- - np.ndarray of floats - Boundaries from -inf to inf - Raises - ------ - ValueError - If values of ``arr`` are not monotonically increasing. 
- """ - if left is None: - left = -np.inf - if right is None: - right = np.inf - - if not np.all(np.diff(arr) > 0): - raise ValueError("Boundaries are not monotonically increasing") - - items = list(arr) - if items[0] <= left: - items[0] = -np.inf - else: - items.insert(0, -np.inf) - - if items[-1] >= right: - items[-1] = np.inf - else: - items.append(np.inf) - - return np.array(items) - - def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any]]: for a, b in pairs: if a != b: @@ -92,7 +44,7 @@ def __init__( self, dotprops, matching_sets, - boundaries: Sequence[Sequence[float]], + digitizers: Sequence[Digitizer], match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching=None, seed=DEFAULT_SEED, @@ -105,15 +57,8 @@ def __init__( as Dotprops objects. matching_sets : list of sets of int Sets of neurons, as indices into ``dotprops``, which should be considered matches. - boundaries : sequence of array-likes of floats - List of lists, where the inner lists are monotonically increasing - from -inf to inf. - The length of the outer list is the dimensionality of the lookup table. - The inner lists are the boundaries of bins for that dimension, - i.e. an inner list of N items describes N-1 bins. - If an inner list is not inf-bounded, - -inf and inf will be prepended and appended. - See the ``wrap_bounds`` convenience function. + digitizers : sequence of Digitizers + The length is the dimensionality of the lookup table. match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] Function taking 2 arguments, both instances of ``navis.core.neurons.Dotprops``, @@ -136,7 +81,7 @@ def __init__( self.matching_sets = matching_sets self._nonmatching = nonmatching self.match_fn = match_fn - self.boundaries = [wrap_bounds(b) for b in boundaries] + self.digitizers = list(digitizers) self.rng = np.random.default_rng(seed) @@ -162,12 +107,12 @@ def _yield_nonmatching_pairs(self): return yield_not_same(permutations(self.nonmatching, 2)) def _empty_counts(self): - shape = [len(b) - 1 for b in self.boundaries] + shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) def _query_to_idxs(self, q_idx, t_idx, counts=None): response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) - idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] + idxs = [dig(r) for dig, r in zip(self.digitizers, response)] if counts is None: counts = self._empty_counts() @@ -216,7 +161,7 @@ def _pick_nonmatching_pairs(self, n_matching_qual_vals): return nonmatching_pairs - def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: + def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw @@ -239,7 +184,7 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) ) - return self.boundaries, cells + return self.digitizers, cells def build(self, threads=None) -> LookupNd: """Build the score matrix. @@ -265,7 +210,8 @@ def build(self, threads=None) -> LookupNd: ValueError If dist_bins or dot_bins are not set. 
""" - return LookupNd(*self._build(threads)) + dig, cells = self._build(threads) + return LookupNd(dig, cells) def dist_dot(q: Dotprops, t: Dotprops): @@ -282,8 +228,8 @@ def __init__( self, dotprops, matching_sets, - dist_boundaries, - dot_boundaries, + dist_digitizer: Digitizer, + dot_digitizer: Digitizer, nonmatching=None, use_alpha=False, seed=DEFAULT_SEED, @@ -334,106 +280,256 @@ def __init__( super().__init__( dotprops, matching_sets, - [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + [dist_digitizer, dot_digitizer], match_fn, nonmatching, seed, ) def build(self, threads=None) -> Lookup2d: - return Lookup2d(*self._build(threads)) + (dig0, dig1), cells = self._build(threads) + return Lookup2d(dig0, dig1, cells) + + +def is_monotonically_increasing(lst): + for prev_idx, item in enumerate(lst[1:]): + if item <= lst[prev_idx]: + return False + return True + + +def parse_boundary(item: str): + explicit_interval = item[0] + item[-1] + if explicit_interval == "[)": + right = False + elif explicit_interval == "(]": + right = True + else: + raise ValueError( + f"Enclosing characters '{explicit_interval}' do not match a half-open interval" + ) + return tuple(float(i) for i in item[1:-1].split(",")), right + + +class Digitizer: + def __init__( + self, + boundaries: Sequence[float], + clip: Tuple[bool, bool] = (True, True), + right=False, + ): + self.right = right + + boundaries = list(boundaries) + self._min = boundaries[0] + if clip[0]: + boundaries[0] = -math.inf + elif boundaries[0] != -math.inf: + boundaries.insert(0, -math.inf) + + self._max = boundaries[-1] + if clip[1]: + boundaries[-1] = math.inf + elif boundaries[-1] != math.inf: + boundaries.append(math.inf) + + if not is_monotonically_increasing(boundaries): + raise ValueError("Boundaries are not monotonically increasing") + + self.boundaries = np.asarray(boundaries) + + def __len__(self): + return len(self.boundaries) - 1 + + def __call__(self, value: float): + # searchsorted is marginally faster than digitize as it skips monotonicity checks + return ( + np.searchsorted( + self.boundaries, value, side="left" if self.right else "right" + ) + - 1 + ) + + def to_strings(self) -> List[str]: + if self.right: + lb = "(" + rb = "]" + else: + lb = "[" + rb = ")" + + return [ + f"{lb}{lower},{upper}{rb}" + for lower, upper in zip(self.boundaries[:-1], self.boundaries[1:]) + ] + + @classmethod + def from_strings( + cls, bound_strs: Sequence[str] + ): + """Set digitizer boundaries based on a sequence of interval expressions. + + e.g. ``["(0, 1]", "(1, 5]", "(5, 10]"]`` + + The lowermost and uppermost boundaries are converted to -infinity and infinity respectively. + + Parameters + ---------- + bound_strs : Sequence[str] + Strings representing intervals, which must abut and have open/closed boundaries + specified by brackets. 
+ + Returns + ------- + Digitizer + """ + bounds: List[float] = [] + last_upper = None + last_right = None + for item in bound_strs: + (lower, upper), right = parse_boundary(item) + bounds.append(float(lower)) + + if last_right is not None: + if right != last_right: + raise ValueError("Inconsistent half-open interval") + else: + last_right = right + + if last_upper is not None: + if lower != last_upper: + raise ValueError("Half-open intervals do not abut") + + last_upper = upper + + bounds.append(float(last_upper)) + return cls(bounds, right=last_right) + + @classmethod + def from_linear(cls, lower: float, upper: float, nbins: int, right=False): + """Choose digitizer boundaries spaced linearly between two values. + + Parameters + ---------- + lower : float + Lowest value + upper : float + Highest value + nbins : int + Number of bins + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + + Returns + ------- + Digitizer + """ + arr = np.linspace(lower, upper, nbins + 1, endpoint=True) + return cls(arr, right=right) + + @classmethod + def from_geom(cls, lowest_upper: float, upper: float, nbins: int, right=False): + """Choose digitizer boundaries in a geometric sequence. + + Parameters + ---------- + lowest_upper : float + Upper bound of the lowest bin. + The lower bound of the lowest bin is often 0, which cannot be represented in a nontrivial geometric sequence. + upper : float + Upper bound of the highest bin. + This will be replaced by +inf, but the value is used for setting the rest of the sequence. + nbins : int + Number of bins + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + + Returns + ------- + Digitizer + """ + # Because geom can't start at 0, instead start at the upper bound + # of the lowest bin, then prepend -inf. + # Because the extra bin is added, do not need to add 1 to nbins in geomspace. + arr = np.geomspace(lowest_upper, upper, nbins, True) + return cls(arr, clip=(False, True), right=right) + + @classmethod + def from_data(cls, data: Sequence[float], nbins: int, right=False): + """Choose digitizer boundaries to evenly partition the given values. + + Parameters + ---------- + data : Sequence[float] + Data which should be evenly partitioned by the resulting digitizer. 
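+            (e.g. with ``nbins=4``, boundaries are placed at the 0th,
+            25th, 50th, 75th, and 100th percentiles of ``data``).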
+ nbins : int + Number of bins + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + + Returns + ------- + Digitizer + """ + arr = np.quantile(data, np.linspace(0, 1, nbins + 1, True)) + return cls(arr, right=right) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Digitizer): + return NotImplemented + return self.right == other.right and np.allclose(self.boundaries, other.boundaries) class LookupNd: - def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): - if [len(b) - 1 for b in boundaries] != list(cells.shape): + def __init__(self, digitizers: List[Digitizer], cells: np.ndarray): + if [len(b) for b in digitizers] != list(cells.shape): raise ValueError("boundaries and cells have inconsistent bin counts") - self.boundaries = boundaries + self.digitizers = digitizers self.cells = cells def __call__(self, *args): - if len(args) != len(self.boundaries): + if len(args) != len(self.digitizers): raise TypeError( - f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" + f"Lookup takes {len(self.digitizers)} arguments but {len(args)} were given" ) - idxs = tuple( - np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries) - ) + idxs = tuple(d(arg) for d, arg in zip(self.digitizers, args)) out = self.cells[idxs] return out -def format_boundaries(arr): - return [f"[{lower},{upper})" for lower, upper in zip(arr[:-1], arr[1:])] - - -def parse_boundary(item, strict=False): - explicit_interval = item[0] + item[-1] - if strict and item[0] + item[-1] != IMPLICIT_INTERVAL: - raise ValueError( - f"Enclosing characters {explicit_interval} " - f"do not match implicit interval specifiers {IMPLICIT_INTERVAL}" - ) - return tuple(float(i) for i in item[1:-1].split(",")) - - -def parse_boundaries(items, strict=False): - # declaring upper first and then checking for None - # pleases the type checker, and handles the case where - # len(items) == 0 - upper = None - for item in items: - lower, upper = parse_boundary(item, strict) - yield lower - if upper is None: - return - yield upper - - class Lookup2d(LookupNd): """Convenience class inheriting from LookupNd for the common 2D case. Provides IO with pandas DataFrames. """ - def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): - if len(boundaries) != 2: - raise ValueError("boundaries must be of length 2; cells must be 2D") - super().__init__(boundaries, cells) + def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarray): + super().__init__([digitizer0, digitizer1], cells) def to_dataframe(self) -> pd.DataFrame: - # numpy.digitize includes left, excludes right, i.e. "[left,right)" return pd.DataFrame( self.cells, - format_boundaries(self.boundaries[0]), - format_boundaries(self.boundaries[1]), + self.digitizers[0].to_strings(), + self.digitizers[1].to_strings(), ) @classmethod - def from_dataframe(cls, df: pd.DataFrame, strict=False): + def from_dataframe(cls, df: pd.DataFrame): f"""Parse score matrix from a dataframe with string index and column labels. + Expects the index and column labels to specify an interval like ``f"[{{lower}},{{upper}})"``. Will replace the lowermost and uppermost bound with -inf and inf if they are not already. - Parameters - ---------- - strict : bool, optional - If falsey (default), ignores parentheses and brackets, - effectively normalising to - ``{IMPLICIT_INTERVAL[0]}lower,upper{IMPLICIT_INTERVAL[1]}`` - as an implementation detail. 
- Otherwise, raises a ValueError if parens/brackets - do not match the implementation. """ - boundaries = [] - for arr in (df.index, df.columns): - b = np.array(list(parse_boundaries(arr, strict))) - b[0] = -np.inf - b[-1] = np.inf - boundaries.append(b) - - return cls(boundaries, df.to_numpy()) + return cls( + Digitizer.from_strings(df.index), + Digitizer.from_strings(df.columns), + df.to_numpy(), + ) @lru_cache(maxsize=None) @@ -444,9 +540,7 @@ def _smat_fcwb(alpha=False): fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha] fpath = smat_path / fname - return Lookup2d.from_dataframe( - pd.read_csv(fpath, index_col=0) - ) + return Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0)) def smat_fcwb(alpha=False): @@ -489,7 +583,7 @@ def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): SCORE_FN_DESCR = """ -NBLAST score functions take 2 floats or numpy arrays of floats of length N +NBLAST score functions take 2 floats or N-length numpy arrays of floats (for matched dotprop points/tangents, distance and dot product; the latter possibly scaled by the geometric mean of the alpha colinearity values) and returns a float or N-length numpy array of floats. @@ -510,7 +604,7 @@ def parse_score_fn(smat, alpha=False): Also checks the signature of the callable. Raises an error, probably a ValueError, if it can't be interpreted. alpha : optional bool, default False - If ``smat`` is None, choose whether to use the FCWB matrices + If ``smat`` is ``"auto"``, choose whether to use the FCWB matrices with or without alpha. Returns ------- @@ -522,7 +616,7 @@ def parse_score_fn(smat, alpha=False): """ if smat is None: smat = operator.mul - elif smat == 'auto': + elif smat == "auto": smat = smat_fcwb(alpha) if isinstance(smat, (str, os.PathLike)): @@ -532,7 +626,9 @@ def parse_score_fn(smat, alpha=False): smat = Lookup2d.from_dataframe(smat) if not callable(smat): - raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + raise ValueError( + "smat should be a callable, a path, a pandas.DataFrame, or 'auto'" + ) check_score_fn(smat) From 7ad57322f431480a89d1a0f3642ce52b2461eac9 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 26 Nov 2021 14:31:38 +0000 Subject: [PATCH 06/21] smat: calculate digitizers from data --- navis/nbl/smat.py | 259 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 181 insertions(+), 78 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index ca829150..b30befc8 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -4,13 +4,25 @@ import os from collections import Counter from concurrent.futures import ProcessPoolExecutor -from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple, Union +from typing import ( + Hashable, + Iterator, + Mapping, + Optional, + Sequence, + Callable, + List, + Iterable, + Any, + Tuple, +) import logging from pathlib import Path from functools import lru_cache from copy import deepcopy import operator import math +from collections import defaultdict import numpy as np import pandas as pd @@ -39,26 +51,43 @@ def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any] yield a, b +def concat_results(results: Iterable[List[np.ndarray]]) -> List[np.ndarray]: + intermediate = defaultdict(list) + for result_lst in results: + for idx, array in enumerate(result_lst): + intermediate[idx].append(array) + + return [np.concatenate(arrs) for arrs in intermediate.values()] + + +DotpropKey = Hashable + + class LookupNdBuilder: def __init__( self, - dotprops, 
-        matching_sets,
-        digitizers: Sequence[Digitizer],
         match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]],
-        nonmatching=None,
-        seed=DEFAULT_SEED,
+        dotprops: Mapping[DotpropKey, Dotprops],
+        matching_lists: List[List[DotpropKey]],
+        nonmatching_list: Optional[List[DotpropKey]] = None,
+        seed: int = DEFAULT_SEED,
     ) -> None:
         f"""Class for building an N-dimensional score lookup for NBLAST.
+
+        Once instantiated, the axes of the lookup table must be defined.
+        Call ``.with_digitizers()`` to manually define them,
+        or ``.with_bin_counts()`` to learn them from the matched-pair data.
+
+        Then call ``.build()`` to build the lookup table.
+
         Parameters
         ----------
+
         dotprops : dict or list of Dotprops
-            An indexable sequence of all neurons which will be used as the training set,
-            as Dotprops objects.
-        matching_sets : list of sets of int
-            Sets of neurons, as indices into ``dotprops``, which should be considered matches.
-        digitizers : sequence of Digitizers
-            The length is the dimensionality of the lookup table.
+            An indexable, consistently-ordered sequence of all neurons
+            which will be used as the training set, as Dotprops objects.
+        matching_lists : list of lists of indices into dotprops
+            Lists of neurons, as indices into ``dotprops``, which should be considered matches.
         match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]]
             Function taking 2 arguments,
             both instances of ``navis.core.neurons.Dotprops``,
             and returning a list of 1D ``numpy.ndarray``s of floats.
-            The length of the list must be the same as the length of ``boundaries``.
+            The length of the list must be the same as the dimensionality of the lookup table.
             The length of the ``array``s must be the same
             as the number of points in the first argument.
             This function returns values describing the quality of
             point matches from a query to a target neuron.
-        nonmatching : set of int, optional
-            Set of neurons, as indices into ``dotprops``,
+        nonmatching_list : list of indices into dotprops, optional
+            List of neurons, as indices into ``dotprops``,
             which should not be considered matches.
             If not given, all ``dotprops`` will be used
             (on the assumption that matches are a small subset of possible pairs).
         seed : int, optional
             Non-matching pairs are drawn at random using this seed,
             by default {DEFAULT_SEED}
         """
         self.dotprops = dotprops
-        self.matching_sets = matching_sets
-        self._nonmatching = nonmatching
+        self.matching_lists = matching_lists
+        self._nonmatching_list = nonmatching_list
         self.match_fn = match_fn
+
+        self.digitizers: Optional[List[Digitizer]] = None
+        self.bin_counts: Optional[List[int]] = None
 
         self.rng = np.random.default_rng(seed)
 
+    def with_digitizers(self, digitizers: List[Digitizer]):
+        """Specify the axes of the output lookup table directly.
+
+        Parameters
+        ----------
+        digitizers : List[Digitizer]
+
+        Returns
+        -------
+        self
+            For chaining convenience.
+        """
+        self.digitizers = digitizers
+        return self
+
+    def with_bin_counts(self, bin_counts: List[int]):
+        """Specify the number of bins on each axis of the output lookup table.
+
+        The bin boundaries will be determined by evenly partitioning the data
+        from the matched pairs into quantiles, in each dimension.
+
+        Parameters
+        ----------
+        bin_counts : List[int]
+
+        Returns
+        -------
+        self
+            For chaining convenience.
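+
+        Examples
+        --------
+        For instance, a 2-dimensional (distance, dot product) builder
+        might use::
+
+            builder.with_bin_counts([21, 10])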
+ """ + self.bin_counts = bin_counts + return self + + def _dotprop_keys(self) -> Sequence[DotpropKey]: + """Get all indices into dotprops instance member""" try: return self.dotprops.keys() except AttributeError: return range(len(self.dotprops)) @property - def nonmatching(self): + def nonmatching(self) -> List[DotpropKey]: """Indices of nonmatching set of neurons""" - if self._nonmatching is None: - return set(self._dotprop_keys()) - return self._nonmatching + if self._nonmatching_list is None: + return list(self._dotprop_keys()) + return self._nonmatching_list - def _yield_matching_pairs(self): - for ms in self.matching_sets: + def _yield_matching_pairs(self) -> Iterator[Tuple[DotpropKey, DotpropKey]]: + """Yield all index pairs within all matching pairs""" + for ms in self.matching_lists: yield from yield_not_same(permutations(ms, 2)) - def _yield_nonmatching_pairs(self): + def _yield_nonmatching_pairs(self) -> Iterator[Tuple[DotpropKey, DotpropKey]]: + """Yield all index pairs within nonmatching list""" # todo: this could be much better, use meshgrid or shuffle index arrays return yield_not_same(permutations(self.nonmatching, 2)) - def _empty_counts(self): + def _empty_counts(self) -> np.ndarray: + """Create an empty array in which to store counts; shape determined by digitizer sizes.""" shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) + def _query(self, q_idx, t_idx) -> List[np.ndarray]: + """Get the results of applying the match function to dotprops specified by indices""" + return self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) + + def _query_many(self, idx_pairs, threads=None) -> Iterator[List[np.ndarray]]: + """Yield results from querying many pairs of dotprop indices""" + if threads is None or threads == 0 and cpu_count == 1: + for q_idx, t_idx in idx_pairs: + yield self._query(q_idx, t_idx) + + threads = threads or cpu_count + idx_pairs = np.asarray(idx_pairs) + chunks = chunksize(len(idx_pairs), threads) + + with ProcessPoolExecutor(threads) as exe: + yield from exe.map( + self._query, idx_pairs[:, 0], idx_pairs[:, 1], chunksize=chunks + ) + def _query_to_idxs(self, q_idx, t_idx, counts=None): - response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) - idxs = [dig(r) for dig, r in zip(self.digitizers, response)] + """Produce a digitized counts array from a given query-target pair""" + return self._count_results(self._query(q_idx, t_idx), counts) + + def _count_results(self, results: List[np.ndarray], counts=None): + """Convert raw match function ouput into a digitized counts array. + + Requires digitizers. + """ + idxs = [dig(r) for dig, r in zip(self.digitizers, results)] if counts is None: counts = self._empty_counts() @@ -123,6 +217,10 @@ def _query_to_idxs(self, q_idx, t_idx, counts=None): return counts def _counts_array(self, idx_pairs, threads=None): + """Convert index pairs into a digitized counts array. + + Requires digitizers. 
+ """ counts = self._empty_counts() if threads is None or threads == 0 and cpu_count == 1: for q_idx, t_idx in idx_pairs: @@ -133,6 +231,8 @@ def _counts_array(self, idx_pairs, threads=None): idx_pairs = np.asarray(idx_pairs, dtype=int) chunks = chunksize(len(idx_pairs), threads) + # because digitizing is not necessarily free, + # keep this parallelisation separate to that in _query_many with ProcessPoolExecutor(threads) as exe: for these_counts in exe.map( self._query_to_idxs, @@ -145,6 +245,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts def _pick_nonmatching_pairs(self, n_matching_qual_vals): + """Using the seeded RNG, pick which nonmatching pairs to use.""" # pre-calculating which pairs we're going to use, # rather than drawing them as we need them, # means that we can parallelise the later step more effectively. @@ -162,6 +263,11 @@ def _pick_nonmatching_pairs(self, n_matching_qual_vals): return nonmatching_pairs def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: + if self.digitizers is None and self.bin_counts is None: + raise ValueError( + "Builder needs either digitizers or bin_counts; see with_* methods" + ) + matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw @@ -171,8 +277,16 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: ) nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals) + if self.digitizers: + match_counts = self._counts_array(matching_pairs, threads) + else: + match_results = concat_results(self._query_many(matching_pairs, threads)) + self.digitizers = [ + Digitizer.from_data(data, nbins) + for data, nbins in zip(match_results, self.bin_counts) + ] + match_counts = self._count_results(match_results) - match_counts = self._counts_array(matching_pairs, threads) nonmatch_counts = self._counts_array(nonmatching_pairs, threads) # account for there being different total numbers of match/nonmatch dist dots @@ -188,27 +302,26 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: def build(self, threads=None) -> LookupNd: """Build the score matrix. + All non-identical neuron pairs within all matching sets are selected, and distdots calculated for those pairs. Then, the minimum number of non-matching pairs are randomly drawn so that at least as many distdots can be calculated for non-matching pairs. + In each bin of the score matrix, the log2 odds ratio of a distdot in that bin belonging to a match vs. non-match is calculated. + Parameters ---------- threads : int, optional If None, act in serial. If 0, use cpu_count - 1. Otherwise, use the given value. + Returns ------- - pd.DataFrame - Suitable for passing to navis.nbl.ScoringFunction - Raises - ------ - ValueError - If dist_bins or dot_bins are not set. + LookupNd """ dig, cells = self._build(threads) return LookupNd(dig, cells) @@ -226,45 +339,27 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): class LookupDistDotBuilder(LookupNdBuilder): def __init__( self, - dotprops, - matching_sets, - dist_digitizer: Digitizer, - dot_digitizer: Digitizer, - nonmatching=None, - use_alpha=False, - seed=DEFAULT_SEED, + dotprops: Mapping[DotpropKey, Dotprops], + matching_lists: List[List[DotpropKey]], + nonmatching_list: Optional[List[DotpropKey]] = None, + use_alpha: bool = False, + seed: int = DEFAULT_SEED, ): f"""Class for building a 2-dimensional score lookup for NBLAST. The scores are 1. The distances between best-matching points 2. 
The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. + Parameters ---------- dotprops : dict or list of Dotprops An indexable sequence of all neurons which will be used as the training set, as Dotprops objects. - matching_sets : list of sets of int - Sets of neurons, as indices into ``dotprops``, which should be considered matches. - dist_boundaries : array-like of floats - Monotonically increasing boundaries between distance bins - (i.e. N values makes N-1 bins). - If the first value > 0, -inf will be prepended, - otherwise the first value will be replaced with -inf. - inf will be appended unless the last value is already inf. - Consider using ``numpy.logspace`` or ``numpy.geomspace`` - to generate the inner boundaries of the bins. - dot_boundaries : array-like of floats - Monotonically increasing boundaries between - (possibly alpha-scaled) dot product bins - (i.e. N values makes N-1 bins). - If the first value > 0, -inf will be prepended, - otherwise the first value will be replaced with -inf. - If the last value < 1, inf will be appended, - otherwise the last value will be replaced with inf. - Consider using ``numpy.linspace`` between 0 and 1. - nonmatching : set of int, optional - Set of neurons, as indices into ``dotprops``, + matching_lists : list of lists of indices into dotprops + List of neurons, as indices into ``dotprops``, which should be considered matches. + nonmatching_list : list of indices into dotprops, optional + List of neurons, as indices into ``dotprops``, which should not be considered matches. If not given, all ``dotprops`` will be used (on the assumption that matches are a small subset of possible pairs). @@ -279,14 +374,17 @@ def __init__( match_fn = dist_dot_alpha if use_alpha else dist_dot super().__init__( dotprops, - matching_sets, - [dist_digitizer, dot_digitizer], + matching_lists, match_fn, - nonmatching, + nonmatching_list, seed, ) def build(self, threads=None) -> Lookup2d: + if (self.digitizers and len(self.digitizers) != 2) or ( + self.bin_counts and len(self.bin_counts) != 2 + ): + raise ValueError("Expected 2 axes") (dig0, dig1), cells = self._build(threads) return Lookup2d(dig0, dig1, cells) @@ -364,9 +462,7 @@ def to_strings(self) -> List[str]: ] @classmethod - def from_strings( - cls, bound_strs: Sequence[str] - ): + def from_strings(cls, interval_strs: Sequence[str]): """Set digitizer boundaries based on a sequence of interval expressions. e.g. ``["(0, 1]", "(1, 5]", "(5, 10]"]`` @@ -386,7 +482,7 @@ def from_strings( bounds: List[float] = [] last_upper = None last_right = None - for item in bound_strs: + for item in interval_strs: (lower, upper), right = parse_boundary(item) bounds.append(float(lower)) @@ -409,6 +505,8 @@ def from_strings( def from_linear(cls, lower: float, upper: float, nbins: int, right=False): """Choose digitizer boundaries spaced linearly between two values. + Input values will be clipped to fit within the given interval. + Parameters ---------- lower : float @@ -429,17 +527,18 @@ def from_linear(cls, lower: float, upper: float, nbins: int, right=False): return cls(arr, right=right) @classmethod - def from_geom(cls, lowest_upper: float, upper: float, nbins: int, right=False): + def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False): """Choose digitizer boundaries in a geometric sequence. + Additional bins will be added above and below the given values. + Parameters ---------- lowest_upper : float Upper bound of the lowest bin. 
The lower bound of the lowest bin is often 0, which cannot be represented in a nontrivial geometric sequence. - upper : float - Upper bound of the highest bin. - This will be replaced by +inf, but the value is used for setting the rest of the sequence. + highest_lower : float + Lower bound of the highest bin. nbins : int Number of bins right : bool, optional @@ -453,8 +552,8 @@ def from_geom(cls, lowest_upper: float, upper: float, nbins: int, right=False): # Because geom can't start at 0, instead start at the upper bound # of the lowest bin, then prepend -inf. # Because the extra bin is added, do not need to add 1 to nbins in geomspace. - arr = np.geomspace(lowest_upper, upper, nbins, True) - return cls(arr, clip=(False, True), right=right) + arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True) + return cls(arr, clip=False, right=right) @classmethod def from_data(cls, data: Sequence[float], nbins: int, right=False): @@ -480,7 +579,9 @@ def from_data(cls, data: Sequence[float], nbins: int, right=False): def __eq__(self, other: object) -> bool: if not isinstance(other, Digitizer): return NotImplemented - return self.right == other.right and np.allclose(self.boundaries, other.boundaries) + return self.right == other.right and np.allclose( + self.boundaries, other.boundaries + ) class LookupNd: @@ -587,7 +688,9 @@ def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): (for matched dotprop points/tangents, distance and dot product; the latter possibly scaled by the geometric mean of the alpha colinearity values) and returns a float or N-length numpy array of floats. -""".strip().replace("\n", " ") +""".strip().replace( + "\n", " " +) def parse_score_fn(smat, alpha=False): From 902ed7401409cdfef283ec2560c9721c2ddd25d2 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 26 Nov 2021 14:52:19 +0000 Subject: [PATCH 07/21] Fix tests, finish docs --- navis/nbl/smat.py | 25 +++++++++++++++++++------ tests/test_nbl/test_smat.py | 13 +++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index b30befc8..29a62162 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -15,6 +15,7 @@ Iterable, Any, Tuple, + Union, ) import logging from pathlib import Path @@ -66,7 +67,7 @@ def concat_results(results: Iterable[List[np.ndarray]]) -> List[np.ndarray]: class LookupNdBuilder: def __init__( self, - dotprops: Mapping[DotpropKey, Dotprops], + dotprops: Union[List[Dotprops], Mapping[DotpropKey, Dotprops]], matching_lists: List[List[DotpropKey]], match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching_list: Optional[List[DotpropKey]] = None, @@ -339,7 +340,7 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): class LookupDistDotBuilder(LookupNdBuilder): def __init__( self, - dotprops: Mapping[DotpropKey, Dotprops], + dotprops: Union[List[Dotprops], Mapping[DotpropKey, Dotprops]], matching_lists: List[List[DotpropKey]], nonmatching_list: Optional[List[DotpropKey]] = None, use_alpha: bool = False, @@ -416,6 +417,21 @@ def __init__( clip: Tuple[bool, bool] = (True, True), right=False, ): + """Class converting continuous values into discrete indices given specific bin boundaries. + + Parameters + ---------- + boundaries : Sequence[float] + N boundaries specifying N-1 bins. + Must be monotonically increasing. + clip : Tuple[bool, bool], optional + Whether to set the bottom and top boundaries to -infinity and infinity respectively, + effectively clipping incoming values: by default (True, True). 
+ False means "add a new bin for out-of-range values". + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + """ self.right = right boundaries = list(boundaries) @@ -549,11 +565,8 @@ def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right= ------- Digitizer """ - # Because geom can't start at 0, instead start at the upper bound - # of the lowest bin, then prepend -inf. - # Because the extra bin is added, do not need to add 1 to nbins in geomspace. arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True) - return cls(arr, clip=False, right=right) + return cls(arr, clip=(False, False), right=right) @classmethod def from_data(cls, data: Sequence[float], nbins: int, right=False): diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index e1d821fd..76d7f9fb 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -99,7 +99,7 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): # make jittered copies of these neurons rng = np.random.default_rng(SEED) jitter_sigma = 50 - matching_sets = [] + matching_lists = [] for idx, dp in enumerate(dotprops[:]): dotprops.append( Dotprops( @@ -107,7 +107,7 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): ) ) # assign each neuron its jittered self as a match - matching_sets.append({idx, idx + n_orig}) + matching_lists.append([idx, idx + n_orig]) # original neurons should all not match each other nonmatching = list(range(n_orig)) @@ -122,13 +122,14 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): return LookupDistDotBuilder( dotprops, - matching_sets, - Digitizer.from_geom(10, max_dist, 5), - Digitizer.from_linear(0, 1, 5), + matching_lists, nonmatching, alpha, seed=SEED + 1, - ) + ).with_digitizers([ + Digitizer.from_geom(10, max_dist, 5), + Digitizer.from_linear(0, 1, 5), + ]) @pytest.mark.parametrize(["alpha"], [(True,), (False,)]) From fa87ae32616aa0e1b4f5863e50469288d612c792 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 26 Nov 2021 18:08:38 +0000 Subject: [PATCH 08/21] smat: cache match results --- .gitignore | 2 ++ navis/nbl/smat.py | 69 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index da2d03ca..d35c0bea 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,5 @@ ENV/ #*.swc #*.json MANIFEST + +tmp/ diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 29a62162..f27fd96e 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -115,7 +115,16 @@ def __init__( self.digitizers: Optional[List[Digitizer]] = None self.bin_counts: Optional[List[int]] = None - self.rng = np.random.default_rng(seed) + self.seed = seed + self._ndim: Optional[int] = None + + @property + def ndim(self) -> int: + if self._ndim is None: + idx1, idx2 = self._dotprop_keys()[:2] + self._ndim = len(self._query(idx1, idx2)) + self._query.cache_clear() + return self._ndim def with_digitizers(self, digitizers: List[Digitizer]): """Specify the axes of the output lookup table directly. @@ -129,7 +138,14 @@ def with_digitizers(self, digitizers: List[Digitizer]): self For chaining convenience. 
""" + if len(digitizers) != self.ndim: + raise ValueError( + f"Match function returns {self.ndim} values " + f"but provided {len(digitizers)} digitizers" + ) + self.digitizers = digitizers + self.bin_counts = None return self def with_bin_counts(self, bin_counts: List[int]): @@ -147,7 +163,14 @@ def with_bin_counts(self, bin_counts: List[int]): self For chaining convenience. """ + if len(bin_counts) != self.ndim: + raise ValueError( + f"Match function returns {self.ndim} values " + f"but provided {len(bin_counts)} bin counts" + ) + self.bin_counts = bin_counts + self.digitizers = None return self def _dotprop_keys(self) -> Sequence[DotpropKey]: @@ -179,6 +202,7 @@ def _empty_counts(self) -> np.ndarray: shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) + @lru_cache def _query(self, q_idx, t_idx) -> List[np.ndarray]: """Get the results of applying the match function to dotprops specified by indices""" return self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) @@ -217,7 +241,7 @@ def _count_results(self, results: List[np.ndarray], counts=None): return counts - def _counts_array(self, idx_pairs, threads=None): + def _counts_array(self, idx_pairs, threads=None, cache=False): """Convert index pairs into a digitized counts array. Requires digitizers. @@ -242,6 +266,8 @@ def _counts_array(self, idx_pairs, threads=None): chunksize=chunks, ): counts += these_counts + if not cache: + self._query.cache_clear return counts @@ -255,20 +281,16 @@ def _pick_nonmatching_pairs(self, n_matching_qual_vals): all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) nonmatching_pairs = [] n_nonmatching_qual_vals = 0 + rng = np.random.default_rng(self.seed) while n_nonmatching_qual_vals < n_matching_qual_vals: - idx = self.rng.integers(0, len(all_nonmatching_pairs)) + idx = rng.integers(0, len(all_nonmatching_pairs)) nonmatching_pair = all_nonmatching_pairs.pop(idx) nonmatching_pairs.append(nonmatching_pair) n_nonmatching_qual_vals += len(self.dotprops[nonmatching_pair[0]]) return nonmatching_pairs - def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: - if self.digitizers is None and self.bin_counts is None: - raise ValueError( - "Builder needs either digitizers or bin_counts; see with_* methods" - ) - + def _get_pairs(self): matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw @@ -278,14 +300,26 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: ) nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals) + return matching_pairs, nonmatching_pairs + + def _build(self, threads, cache=False) -> Tuple[List[Digitizer], np.ndarray]: + if self.digitizers is None and self.bin_counts is None: + raise ValueError( + "Builder needs either digitizers or bin_counts; see with_* methods" + ) + + matching_pairs, nonmatching_pairs = self._get_pairs() + if self.digitizers: - match_counts = self._counts_array(matching_pairs, threads) + match_counts = self._counts_array(matching_pairs, threads, cache) else: match_results = concat_results(self._query_many(matching_pairs, threads)) self.digitizers = [ Digitizer.from_data(data, nbins) for data, nbins in zip(match_results, self.bin_counts) ] + if not cache: + self._query.cache_clear() match_counts = self._count_results(match_results) nonmatch_counts = self._counts_array(nonmatching_pairs, threads) @@ -301,7 +335,7 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: return self.digitizers, cells - def 
build(self, threads=None) -> LookupNd: + def build(self, threads=None, cache=False) -> LookupNd: """Build the score matrix. All non-identical neuron pairs within all matching sets are selected, @@ -324,7 +358,7 @@ def build(self, threads=None) -> LookupNd: ------- LookupNd """ - dig, cells = self._build(threads) + dig, cells = self._build(threads, cache) return LookupNd(dig, cells) @@ -380,13 +414,10 @@ def __init__( nonmatching_list, seed, ) + self._ndim = 2 - def build(self, threads=None) -> Lookup2d: - if (self.digitizers and len(self.digitizers) != 2) or ( - self.bin_counts and len(self.bin_counts) != 2 - ): - raise ValueError("Expected 2 axes") - (dig0, dig1), cells = self._build(threads) + def build(self, threads=None, cache=False) -> Lookup2d: + (dig0, dig1), cells = self._build(threads, cache) return Lookup2d(dig0, dig1, cells) @@ -664,6 +695,7 @@ def smat_fcwb(alpha=False): def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): """Checks functionally that the callable can be used as a score function. + Parameters ---------- nargs : optional int, default 2 @@ -672,6 +704,7 @@ def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): Check that the function can be used on ``nargs`` scalars. array : optional bool, default True Check that the function can be used on ``nargs`` 1D ``numpy.ndarray``s. + Raises ------ ValueError From 56d780ce35c2df6f98fd6d0f2202c3db8514c2c8 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 15:30:39 +0000 Subject: [PATCH 09/21] Switch Blaster.append to return numerical indices --- navis/nbl/base.py | 9 ++++++++- navis/nbl/nblast_funcs.py | 18 ++++++++---------- navis/nbl/synblast_funcs.py | 13 +++++++------ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/navis/nbl/base.py b/navis/nbl/base.py index c3ad4203..1b238103 100644 --- a/navis/nbl/base.py +++ b/navis/nbl/base.py @@ -17,6 +17,7 @@ import pandas as pd from abc import ABC, abstractmethod +from typing import Union, List from .. import utils, config @@ -24,6 +25,9 @@ FLOAT_DTYPES = {16: np.float16, 32: np.float32, 64: np.float64, None: None} +AppendOutput = Union[int, List['AppendOutput']] + + class Blaster(ABC): """Base class for blasting.""" @@ -37,7 +41,7 @@ def __init__(self, dtype=np.float64, progress=True): self.ids = [] @abstractmethod - def append(self, neurons): + def append(self, neurons) -> AppendOutput: """Append neurons.""" pass @@ -157,3 +161,6 @@ def all_by_all(self, scores='forward'): res.loc[:, :] = np.dstack((res, res.T)).max(axis=2) return res + + def __len__(self): + return len(self.neurons) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 32dae6fa..a6b28271 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -16,7 +16,6 @@ import numbers import os -import uuid import numpy as np import pandas as pd @@ -27,7 +26,7 @@ from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops -from .base import Blaster +from .base import Blaster, AppendOutput __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -104,8 +103,6 @@ def parse_interval(self, s): return float(s.strip("([])").split(",")[-1]) -NeuronId = Union[int, str, uuid.UUID] - class NBlaster(Blaster): """Implements version 2 of the NBLAST algorithm. 
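[Aside: a minimal sketch of the lookup machinery the builder patches above produce. This is not part of the series; the bin edges and cell values are invented, but the class and method names are the ones introduced in navis/nbl/smat.py.]

    import numpy as np
    from navis.nbl.smat import Digitizer, Lookup2d

    # Two axes: geometrically spaced distance bins, linearly spaced dot-product bins.
    dist_dig = Digitizer.from_geom(0.5, 50, 8)
    dot_dig = Digitizer.from_linear(0, 1, 5)
    # Invented cell values; a trained table would come from LookupDistDotBuilder.build().
    cells = np.random.default_rng(1991).normal(size=(len(dist_dig), len(dot_dig)))
    smat = Lookup2d(dist_dig, dot_dig, cells)

    dists = np.array([0.8, 12.0, 200.0])  # distances between matched points
    dots = np.array([0.9, 0.5, 0.1])      # dot products of their tangent vectors
    scores = smat(dists, dots)            # one log-odds-style score per point match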
@@ -163,12 +160,12 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops) -> Union[List[NeuronId], NeuronId]: + def append(self, dotprops) -> AppendOutput: """Append dotprops. - Returns the ID of the appended dotprops. + Returns the numerical index of the appended dotprops. If dotprops is a (possibly nested) sequence of dotprops, - return a (possibly nested) list of IDs. + return a (possibly nested) list of indices. """ if isinstance(dotprops, Dotprops): return self._append_dotprops(dotprops) @@ -178,12 +175,13 @@ def append(self, dotprops) -> Union[List[NeuronId], NeuronId]: except TypeError: # i.e. not iterable raise ValueError(f"Expected Dotprops or iterable thereof; got {type(dotprops)}") - def _append_dotprops(self, dotprops: Dotprops) -> NeuronId: + def _append_dotprops(self, dotprops: Dotprops) -> int: + next_id = len(self) self.neurons.append(dotprops) self.ids.append(dotprops.id) # Calculate score for self hit self.self_hits.append(self.calc_self_hit(dotprops)) - return dotprops.id + return next_id def calc_self_hit(self, dotprops): """Non-normalized value for self hit.""" @@ -195,7 +193,7 @@ def calc_self_hit(self, dotprops): dots = np.repeat(1, len(dotprops.points)) * np.sqrt(alpha) return self.score_fn(dists, dots).sum() - def single_query_target(self, q_idx, t_idx, scores='forward'): + def single_query_target(self, q_idx: int, t_idx: int, scores='forward'): """Query single target against single target.""" # Take a short-cut if this is a self-self comparison if q_idx == t_idx: diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index c0b7812d..821e617e 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -27,7 +27,7 @@ from ..
import config, utils from ..core import NeuronList, BaseNeuron -from .base import Blaster +from .base import Blaster, AppendOutput from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, nblast_preflight) @@ -84,8 +84,8 @@ def __init__(self, normalized=True, by_type=True, self.score_fn = ScoringFunction(smat) self.ids = [] - def append(self, neuron, id=None): - """Append neurons/connector tables, returning ids of added objects""" + def append(self, neuron, id=None) -> AppendOutput: + """Append neurons/connector tables, returning numerical indices of added objects""" if isinstance(neuron, pd.DataFrame): return self._append_connectors(neuron, id) @@ -102,10 +102,11 @@ def append(self, neuron, id=None): f"{type(neuron)}" ) - def _append_connectors(self, connectors: pd.DataFrame, id): + def _append_connectors(self, connectors: pd.DataFrame, id) -> int: if id is None: raise ValueError("Explicit non-None id required for appending connectors") + next_idx = len(self) self.ids.append(id) self.neurons.append({}) if not self.by_type: @@ -123,13 +124,13 @@ def _append_connectors(self, connectors: pd.DataFrame, id): # Calculate score for self hit self.self_hits.append(self.calc_self_hit(connectors)) - return id + return next_idx def calc_self_hit(self, cn): """Non-normalized value for self hit.""" return cn.shape[0] * self.score_fn(0, 1) - def single_query_target(self, q_idx, t_idx, scores='forward'): + def single_query_target(self, q_idx: int, t_idx: int, scores='forward'): """Query single target against single target.""" # Take a short-cut if this is a self-self comparison if q_idx == t_idx: From f84d02bfcf432c6732fff0080cbd1ecee01ae128 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 16:25:34 +0000 Subject: [PATCH 10/21] Dotprops.dist_dots type annotation fix --- navis/core/dotprop.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/navis/core/dotprop.py b/navis/core/dotprop.py index c4e5c64e..aaa8175c 100644 --- a/navis/core/dotprop.py +++ b/navis/core/dotprop.py @@ -332,7 +332,9 @@ def dist_dots(self, other: 'Dotprops', alpha: bool = False, distance_upper_bound: Optional[float] = None, - **kwargs) -> Tuple[np.ndarray, np.ndarray]: + **kwargs) -> Union[ + Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, np.ndarray] + ]: """Query this Dotprops against another. This function is mainly for ``navis.nblast``. From c2c79acd6620366480e4ac8b1113f294b73e4270 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 16:34:05 +0000 Subject: [PATCH 11/21] small fixes --- navis/nbl/base.py | 4 ++-- navis/nbl/nblast_funcs.py | 4 ++-- navis/nbl/smat.py | 2 +- navis/nbl/synblast_funcs.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/navis/nbl/base.py b/navis/nbl/base.py index 1b238103..9e51344d 100644 --- a/navis/nbl/base.py +++ b/navis/nbl/base.py @@ -25,7 +25,7 @@ FLOAT_DTYPES = {16: np.float16, 32: np.float32, 64: np.float64, None: None} -AppendOutput = Union[int, List['AppendOutput']] +NestedIndices = Union[int, List['NestedIndices']] class Blaster(ABC): @@ -41,7 +41,7 @@ def __init__(self, dtype=np.float64, progress=True): self.ids = [] @abstractmethod - def append(self, neurons) -> AppendOutput: + def append(self, neurons) -> NestedIndices: """Append neurons.""" pass diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index a6b28271..f07986e7 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -26,7 +26,7 @@ from .. 
import utils, config from ..core import NeuronList, Dotprops, make_dotprops -from .base import Blaster, AppendOutput +from .base import Blaster, NestedIndices __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -160,7 +160,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops) -> AppendOutput: + def append(self, dotprops) -> NestedIndices: """Append dotprops. Returns the numerical index appended dotprops. diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index f27fd96e..bbb4e5ac 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -36,7 +36,6 @@ epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) -IMPLICIT_INTERVAL = "[)" fp = Path(__file__).resolve().parent smat_path = fp / "score_mats" @@ -212,6 +211,7 @@ def _query_many(self, idx_pairs, threads=None) -> Iterator[List[np.ndarray]]: if threads is None or threads == 0 and cpu_count == 1: for q_idx, t_idx in idx_pairs: yield self._query(q_idx, t_idx) + return threads = threads or cpu_count idx_pairs = np.asarray(idx_pairs) diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index 821e617e..2bbb9d54 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -27,7 +27,7 @@ from .. import config, utils from ..core import NeuronList, BaseNeuron -from .base import Blaster, AppendOutput +from .base import Blaster, NestedIndices from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, nblast_preflight) @@ -84,7 +84,7 @@ def __init__(self, normalized=True, by_type=True, self.score_fn = ScoringFunction(smat) self.ids = [] - def append(self, neuron, id=None) -> AppendOutput: + def append(self, neuron, id=None) -> NestedIndices: """Append neurons/connector tables, returning numerical indices of added objects""" if isinstance(neuron, pd.DataFrame): return self._append_connectors(neuron, id) From 57c557f5e6c8bb8c27f271e9faeed4674a4a4b2d Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 17:11:27 +0000 Subject: [PATCH 12/21] smat: write out unclipped bounds --- navis/nbl/smat.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index bbb4e5ac..86a8d836 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -466,14 +466,17 @@ def __init__( self.right = right boundaries = list(boundaries) - self._min = boundaries[0] + self._min = -math.inf if clip[0]: + self._min = boundaries[0] boundaries[0] = -math.inf elif boundaries[0] != -math.inf: + self._min = -math.inf boundaries.insert(0, -math.inf) - self._max = boundaries[-1] + self._max = math.inf if clip[1]: + self._max = boundaries[-1] boundaries[-1] = math.inf elif boundaries[-1] != math.inf: boundaries.append(math.inf) @@ -503,9 +506,12 @@ def to_strings(self) -> List[str]: lb = "[" rb = ")" + b = self.boundaries.copy() + b[0] = self._min + b[-1] = self._max return [ f"{lb}{lower},{upper}{rb}" - for lower, upper in zip(self.boundaries[:-1], self.boundaries[1:]) + for lower, upper in zip(b[:-1], b[1:]) ] @classmethod From 8fbee15e9e403438f86026fe953f7a7b4309308b Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 17:13:28 +0000 Subject: [PATCH 13/21] incorporate smat.lookup2d into blasting functions --- navis/nbl/nblast_funcs.py | 41 ++++++++++++++++++++++++------------- navis/nbl/synblast_funcs.py | 25 +++++++++++++++------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git 
a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index f07986e7..e5a7e386 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -16,6 +16,7 @@ import numbers import os +import operator import numpy as np import pandas as pd @@ -24,6 +25,8 @@ from typing import Union, Optional, List from typing_extensions import Literal +from navis.nbl.smat import Lookup2d, smat_fcwb + from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops from .base import Blaster, NestedIndices @@ -119,10 +122,16 @@ class NBlaster(Blaster): normalized : bool If True, will normalize scores by the best possible score (i.e. self-self) of the query neuron. - smat : str | pd.DataFrame - Score matrix. If 'auto' (default), will use scoring matrices + smat : navis.nbl.smat.Lookup2d | pd.DataFrame | str + How to convert the point match pairs into an NBLAST score, + usually by a lookup table. + If 'auto' (default), will use scoring matrices from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be + implementation. + Dataframes will be used to build a ``Lookup2d``. + If ``limit_dist`` is not given, + will attempt to infer from the first axis of the lookup table. + If ``smat=None`` the scores will be generated as the product of the distances and the dotproduct of the vectors of nearest-neighbor pairs. limit_dist : float | "auto" | None @@ -145,20 +154,24 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', self.approx_nn = approx_nn self.desc = "NBlasting" - if smat == 'auto': - if self.use_alpha: - smat = pd.read_csv(f'{smat_path}/smat_alpha_fcwb.csv', - index_col=0) - else: - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) + if smat is None: + self.score_fn = operator.mul + elif smat == 'auto': + self.score_fn = smat_fcwb(self.use_alpha) + elif isinstance(smat, pd.DataFrame): + self.score_fn = Lookup2d.from_dataframe(smat) + else: + self.score_fn = smat - self.score_fn = ScoringFunction(smat) + self.distance_upper_bound = None - if limit_dist == 'auto': - self.distance_upper_bound = self.score_fn.max_dist - else: + if limit_dist is not None: self.distance_upper_bound = limit_dist + else: + try: + self.distance_upper_bound = self.score_fn.digitizers[0]._max + except AttributeError: + pass def append(self, dotprops) -> NestedIndices: """Append dotprops. diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index 2bbb9d54..eec20c1c 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -15,6 +15,7 @@ """Module contains functions implementing SyNBLAST.""" import os +import operator import numpy as np import pandas as pd @@ -28,9 +29,10 @@ from ..core import NeuronList, BaseNeuron from .base import Blaster, NestedIndices +from .smat import Lookup2d from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, - nblast_preflight) + nblast_preflight, smat_fcwb) __all__ = ['synblast'] @@ -59,10 +61,14 @@ class SynBlaster(Blaster): by_type : bool If True will only compare synapses with the same value in the "type" column. - smat : str | pd.DataFrame - Score matrix. If 'auto' (default), will use scoring matrices + smat : navis.nbl.smat.Lookup2d | pd.DataFrame | str + How to convert the point match pairs into an NBLAST score, + usually by a lookup table. + If 'auto' (default), will use scoring matrices from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be + implementation. 
+ Dataframes will be used to build a ``Lookup2d``. + If ``smat=None`` the scores will be generated as the product of the distances and the dotproduct of the vectors of nearest-neighbor pairs. progress : bool @@ -77,9 +83,14 @@ def __init__(self, normalized=True, by_type=True, self.normalized = normalized self.by_type = by_type - if smat == 'auto': - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) + if smat is None: + self.score_fn = operator.mul + elif smat == 'auto': + self.score_fn = smat_fcwb() + elif isinstance(smat, pd.DataFrame): + self.score_fn = Lookup2d.from_dataframe(smat) + else: + self.score_fn = smat self.score_fn = ScoringFunction(smat) self.ids = [] From a7f53495484a54d11d1c642d1f1333fed5714c84 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:15:04 +0000 Subject: [PATCH 14/21] Lookup2d docs --- navis/nbl/smat.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 86a8d836..63982a51 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -381,7 +381,9 @@ def __init__( seed: int = DEFAULT_SEED, ): f"""Class for building a 2-dimensional score lookup for NBLAST. + The scores are + 1. The distances between best-matching points 2. The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. @@ -658,9 +660,33 @@ class Lookup2d(LookupNd): """ def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarray): + """2D lookup table for converting NBLAST matches to scores. + + Commonly read from a ``pandas.DataFrame`` + or trained on data using a ``LookupDistDotBuilder``. + + Parameters + ---------- + digitizer0 : Digitizer + How to convert continuous values into an index for the first axis. + digitizer1 : Digitizer + How to convert continuous values into an index for the second axis. + cells : np.ndarray + Values to look up in the table. + """ super().__init__([digitizer0, digitizer1], cells) def to_dataframe(self) -> pd.DataFrame: + """Convert the lookup table into a ``pandas.DataFrame``. + + From there, it can be shared, saved, and so on. + + The index and column labels describe the intervals represented by each axis. + + Returns + ------- + pd.DataFrame + """ return pd.DataFrame( self.cells, self.digitizers[0].to_strings(), self.digitizers[1].to_strings(), ) From bf00b55d17a7a4cb407cb2ac7ca80652bc622823 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:15:39 +0000 Subject: [PATCH 15/21] Smat building tutorial --- docs/source/api.rst | 9 + docs/source/gallery.rst | 1 + docs/source/tutorials/smat.ipynb | 491 +++++++++++++++++++++++++++++++ 3 files changed, 501 insertions(+) create mode 100644 docs/source/tutorials/smat.ipynb diff --git a/docs/source/api.rst b/docs/source/api.rst index 4161dfbe..d82b4cc6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -364,6 +364,15 @@ Functions to compare morphology. navis.synblast navis.persistence_distances +Utilities for creating your own score matrices for NBLAST can be found in + +.. autosummary:: + :toctree: generated/ + + navis.nbl.smat.Lookup2d + navis.nbl.smat.Digitizer + navis.nbl.smat.LookupDistDotBuilder + Polarity metrics ---------------- .. autosummary:: diff --git a/docs/source/gallery.rst b/docs/source/gallery.rst index 62a61f4b..4da347ad 100644 --- a/docs/source/gallery.rst +++ b/docs/source/gallery.rst @@ -72,6 +72,7 @@ function. There you will also find more examples. tutorials/nblast tutorials/nblast_flycircuit tutorials/nblast_hemibrain + tutorials/smat ..
raw:: html diff --git a/docs/source/tutorials/smat.ipynb b/docs/source/tutorials/smat.ipynb new file mode 100644 index 00000000..45672bb2 --- /dev/null +++ b/docs/source/tutorials/smat.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ".. _smat_intro:\n", + "\n", + "NBLAST score matrix generation\n", + "******************************\n", + "\n", + "NBLAST calculates the similarity between neurons' morphologies.\n", + "For more information on NBLAST in ``navis``, see the other tutorials.\n", + "\n", + "The core of the algorithm is the function which converts point matches\n", + "(a distance and a dot product) into a score\n", + "expressing how likely they are to have come from the same point cloud.\n", + "This is generally a 2D lookup table, referred to as the score matrix,\n", + "generated by \"training\" it on neurons known to be matching or non-matching.\n", + "\n", + "``navis`` provides (and uses by default) the score matrix used in the original publication (Costa et al., 2016).\n", + "This works quite well in many cases.\n", + "However, how appropriate it is for your data depends on a number of factors:\n", + "\n", + "* How big your neurons are (commonly addressed by scaling the distance axis of the built-in score matrix)\n", + "* How you have pre-processed your neurons (pruning dendrites, resampling etc.)\n", + "* Your actual task (matching left-right pairs, finding lineages etc.)\n", + "* How distinct you expect your matches and non-matches to be (e.g. how large a body volume you're drawing neurons from)\n", + "\n", + "Utilities in ``navis.nbl.smat`` allow you to train your own score matrix." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import navis\n", + "from navis.nbl.smat import Lookup2d, LookupDistDotBuilder, Digitizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``Lookup2d`` is the lookup table, and can be given to any nblast-related class or function.\n", + "Each axis is represented by a ``Digitizer``, which converts continuous values into discrete indices.\n", + "These indices are used to look up score values in an array.\n", + "\n", + "The ``LookupDistDotBuilder`` is a class which generates ``Lookup2d`` instances from training data.\n", + "\n", + "First, we need some training data.\n", + "Let's augment our set of example neurons by randomly mutating them:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# use a replicable random number generator\n", + "RNG = np.random.default_rng(2021)\n", + "\n", + "\n", + "def augment_neuron(\n", + " nrn: navis.TreeNeuron, scale_sigma=0.1, translation_sigma=50, jitter_sigma=10\n", + "):\n", + " nrn = nrn.copy(deepcopy=True)\n", + " nrn.name += \"_aug\"\n", + " dims = list(\"xyz\")\n", + " coords = nrn.nodes[dims].to_numpy()\n", + "\n", + " # translate whole neuron\n", + " coords += RNG.normal(scale=translation_sigma, size=coords.shape[-1])\n", + " # jitter individual coordinates\n", + " coords += RNG.normal(scale=jitter_sigma, size=coords.shape)\n", + " # rescale\n", + " mean = np.mean(coords, axis=0)\n", + " coords -= mean\n", + " coords *= RNG.normal(loc=1.0, scale=scale_sigma)\n", + " coords += mean\n", + "\n", + " nrn.nodes[dims] = coords\n", + " return nrn\n", + "\n", + "\n", + "original = list(navis.example_neurons())\n", + "jittered = [augment_neuron(n) for n in original]\n", + "\n", +
"dotprops = [navis.make_dotprops(n, k=5, resample=False) for n in original + jittered]\n", + "matching_pairs = [[idx, idx + len(original)] for idx in range(len(original))]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The score matrix builder needs some neurons as a list of ``navis.Dotprops`` objects,\n", + "and then to know which neurons should match with each other as indices into that list.\n", + "It's assumed that matches are relatively rare among the total set of possible pairings,\n", + "so non-matching pairs are drawn randomly (although a non-matching list can be given explicitly).\n", + "\n", + "Then it needs to know where to draw the boundaries between bins in the output lookup table.\n", + "These can be given explicitly as a list of 2 ``Digitizer`` s,\n", + "or can be inferred from the data: bins will be drawn to evenly partition the matching neuron scores.\n", + "\n", + "The resulting ``Lookup2d`` can be imported/exported as a ``pandas.DataFrame`` for ease of viewing and storing." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
[0.0,0.14588644700379452)[0.14588644700379452,0.29912252023057806)[0.29912252023057806,0.4802508950780032)[0.4802508950780032,0.7351365037506351)[0.7351365037506351,0.9988406500803395)
[2.0140850483155233,57.8493661406481)0.4866610.3790420.3872600.4456480.855775
[57.8493661406481,81.31283353333698)0.4047660.3437020.4064450.4178020.799595
[81.31283353333698,104.08576202392578)0.2579710.3118270.2176260.2487431.013926
[104.08576202392578,128.14104591262304)0.2557280.0896630.1715990.1159971.296510
[128.14104591262304,155.36119651794434)-0.136171-0.107249-0.1257510.2528831.987175
[155.36119651794434,202.6728515625)-0.575078-0.448307-0.475147-0.2210161.407061
[202.6728515625,395.9569088293945)-1.025938-0.948679-0.863801-0.6205120.054148
[395.9569088293945,4709.61474609375)-0.615558-0.737251-0.679764-0.454779-0.197384
\n", + "
" + ], + "text/plain": [ + " [0.0,0.14588644700379452) \\\n", + "[2.0140850483155233,57.8493661406481) 0.486661 \n", + "[57.8493661406481,81.31283353333698) 0.404766 \n", + "[81.31283353333698,104.08576202392578) 0.257971 \n", + "[104.08576202392578,128.14104591262304) 0.255728 \n", + "[128.14104591262304,155.36119651794434) -0.136171 \n", + "[155.36119651794434,202.6728515625) -0.575078 \n", + "[202.6728515625,395.9569088293945) -1.025938 \n", + "[395.9569088293945,4709.61474609375) -0.615558 \n", + "\n", + " [0.14588644700379452,0.29912252023057806) \\\n", + "[2.0140850483155233,57.8493661406481) 0.379042 \n", + "[57.8493661406481,81.31283353333698) 0.343702 \n", + "[81.31283353333698,104.08576202392578) 0.311827 \n", + "[104.08576202392578,128.14104591262304) 0.089663 \n", + "[128.14104591262304,155.36119651794434) -0.107249 \n", + "[155.36119651794434,202.6728515625) -0.448307 \n", + "[202.6728515625,395.9569088293945) -0.948679 \n", + "[395.9569088293945,4709.61474609375) -0.737251 \n", + "\n", + " [0.29912252023057806,0.4802508950780032) \\\n", + "[2.0140850483155233,57.8493661406481) 0.387260 \n", + "[57.8493661406481,81.31283353333698) 0.406445 \n", + "[81.31283353333698,104.08576202392578) 0.217626 \n", + "[104.08576202392578,128.14104591262304) 0.171599 \n", + "[128.14104591262304,155.36119651794434) -0.125751 \n", + "[155.36119651794434,202.6728515625) -0.475147 \n", + "[202.6728515625,395.9569088293945) -0.863801 \n", + "[395.9569088293945,4709.61474609375) -0.679764 \n", + "\n", + " [0.4802508950780032,0.7351365037506351) \\\n", + "[2.0140850483155233,57.8493661406481) 0.445648 \n", + "[57.8493661406481,81.31283353333698) 0.417802 \n", + "[81.31283353333698,104.08576202392578) 0.248743 \n", + "[104.08576202392578,128.14104591262304) 0.115997 \n", + "[128.14104591262304,155.36119651794434) 0.252883 \n", + "[155.36119651794434,202.6728515625) -0.221016 \n", + "[202.6728515625,395.9569088293945) -0.620512 \n", + "[395.9569088293945,4709.61474609375) -0.454779 \n", + "\n", + " [0.7351365037506351,0.9988406500803395) \n", + "[2.0140850483155233,57.8493661406481) 0.855775 \n", + "[57.8493661406481,81.31283353333698) 0.799595 \n", + "[81.31283353333698,104.08576202392578) 1.013926 \n", + "[104.08576202392578,128.14104591262304) 1.296510 \n", + "[128.14104591262304,155.36119651794434) 1.987175 \n", + "[155.36119651794434,202.6728515625) 1.407061 \n", + "[202.6728515625,395.9569088293945) 0.054148 \n", + "[395.9569088293945,4709.61474609375) -0.197384 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "builder = LookupDistDotBuilder(\n", + " dotprops, matching_pairs, use_alpha=True, seed=2021\n", + ").with_bin_counts([8, 5])\n", + "smat = builder.build()\n", + "as_table = smat.to_dataframe()\n", + "as_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we can have this score matrix, we can use it for a problem which can be solved by NBLAST:\n", + "we've mixed up a bag of neurons which look very similar to some of our examples,\n", + "and need to know which they match with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "NBLAST is optimized for data in microns and it looks like your queries are not in microns.\n", + "NBLAST is optimized for data in microns and it looks like your targets are not in microns.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b55a73a0832e495596df888b4e7d879d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Preparing: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
[HTML table markup was stripped during extraction; the NBLAST scores are identical to the text/plain rendering below.]
\n", + "" + ], + "text/plain": [ + " 754538881_aug 722817260_aug 1734350908_aug 1734350788_aug \\\n", + "1734350788 -0.271504 -0.361381 -0.272018 0.159048 \n", + "1734350908 -0.400432 -0.491376 0.858478 -0.437819 \n", + "722817260 -0.157433 0.127931 -0.365195 -0.407436 \n", + "754534424 -0.237769 -0.413532 -0.193794 -0.401148 \n", + "754538881 -0.021153 -0.213349 -0.200214 -0.303390 \n", + "\n", + " 754534424_aug \n", + "1734350788 -0.325920 \n", + "1734350908 -0.216206 \n", + "722817260 -0.393940 \n", + "754534424 0.857898 \n", + "754538881 -0.115017 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_dps = dotprops[:len(original)]\n", + "new_dps = [\n", + " navis.make_dotprops(\n", + " augment_neuron(n),\n", + " k=5,\n", + " resample=False\n", + " )\n", + " for n in original\n", + "]\n", + "RNG.shuffle(new_dps)\n", + "\n", + "result = navis.nblast(\n", + " original_dps, new_dps,\n", + " use_alpha=True, scores=\"mean\", normalized=True,\n", + " smat=smat,\n", + " n_cores=1,\n", + ")\n", + "result.index = [dp.name for dp in original_dps]\n", + "result.columns = [dp.name for dp in new_dps]\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see that while there aren't any particularly good scores\n", + "(this is a very small amount of training data, and one would normally preprocess the neurons),\n", + "in each case the original's best match is its augmented partner." + ] + } + ], + "metadata": { + "interpreter": { + "hash": "97618690babd5eee2c893391179393f7dcc498027dfffda6cb7b8d10b95474fd" + }, + "kernelspec": { + "display_name": "Python 3.9.4 64-bit ('navis': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 60c26dd4960d516a28ed0d9da12d2d2870a804ff Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:19:46 +0000 Subject: [PATCH 16/21] fix lru cache --- navis/nbl/smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 63982a51..2c6f07e6 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -201,7 +201,7 @@ def _empty_counts(self) -> np.ndarray: shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) - @lru_cache + @lru_cache(None) def _query(self, q_idx, t_idx) -> List[np.ndarray]: """Get the results of applying the match function to dotprops specified by indices""" return self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) From 9246cfbdb9e57719a3ce064cd003757f1e45960f Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:27:17 +0000 Subject: [PATCH 17/21] Remove ScoreFunction references --- navis/nbl/nblast_funcs.py | 67 ------------------------------------- navis/nbl/synblast_funcs.py | 3 +- 2 files changed, 1 insertion(+), 69 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index e5a7e386..f1f5463e 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -39,73 +39,6 @@ logger = config.logger -class ScoringFunction: - """Class representing scoring function.""" - - def __init__(self, smat): - if isinstance(smat, type(None)): - self.scoring_function = self.pass_through - elif 
isinstance(smat, (pd.DataFrame, str)): - self.parse_matrix(smat) - self.scoring_function = self.score_lookup - else: - raise TypeError - - def __call__(self, dist, dot): - return self.scoring_function(dist, dot) - - @property - def max_dist(self): - """Max distance considered by the scoring matrix. - - Returns ``None`` if pass through. - """ - if self.scoring_function == self.pass_through: - return None - # The last bin is always `np.inf`, so we need the second last bin - return self.dist_bins[-2] - - def pass_through(self, dist, dot): - """Pass-through scores if no scoring matrix.""" - return dist * dot - - def score_lookup(self, dist, dot): - return self.cells[ - np.digitize(dist, self.dist_bins), - np.digitize(dot, self.dot_bins), - ] - - def parse_matrix(self, smat): - """Parse matrix.""" - if isinstance(smat, str): - smat = pd.read_csv(smat, index_col=0) - - if not isinstance(smat, pd.DataFrame): - raise TypeError(f'Excepted filepath or DataFrame, got "{type(smat)}"') - - self.cells = smat.to_numpy() - - self.dist_thresholds = [self.parse_interval(s) for s in smat.index] - # Make sure right bin is open - self.dist_thresholds[-1] = np.inf - self.dist_bins = np.array(self.dist_thresholds, float) - - self.dot_thresholds = [self.parse_interval(s) for s in smat.columns] - # Make sure right bin is open - self.dot_thresholds[-1] = np.inf - self.dot_bins = np.array(self.dot_thresholds, float) - - def parse_interval(self, s): - """Strip brackets and parse right interval. - - Example - ------- - >>> parse_intervals("(0,0.1]") # doctest: +SKIP - 0.1 - """ - return float(s.strip("([])").split(",")[-1]) - - class NBlaster(Blaster): """Implements version 2 of the NBLAST algorithm. diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index eec20c1c..433039ea 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -31,7 +31,7 @@ from .base import Blaster, NestedIndices from .smat import Lookup2d -from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, +from .nblast_funcs import (check_microns, find_optimal_partition, nblast_preflight, smat_fcwb) __all__ = ['synblast'] @@ -92,7 +92,6 @@ def __init__(self, normalized=True, by_type=True, else: self.score_fn = smat - self.score_fn = ScoringFunction(smat) self.ids = [] def append(self, neuron, id=None) -> NestedIndices: From 62a1daa5d76de8420bd1c6ed9fe9fa6db969712c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:49:04 +0000 Subject: [PATCH 18/21] fix limit_dist test --- navis/nbl/nblast_funcs.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index f1f5463e..8fc24384 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -96,15 +96,14 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.score_fn = smat - self.distance_upper_bound = None - - if limit_dist is not None: - self.distance_upper_bound = limit_dist - else: + if limit_dist == "auto": try: self.distance_upper_bound = self.score_fn.digitizers[0]._max except AttributeError: - pass + logger.warning("Could not infer distance upper bound from scoring function") + self.distance_upper_bound = None + else: + self.distance_upper_bound = limit_dist def append(self, dotprops) -> NestedIndices: """Append dotprops. 
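[Aside: a minimal usage sketch of the behaviour the two fixes above settle on — hypothetical data, but the signatures match the patched code. ``append`` now hands back positional indices which feed straight into ``single_query_target``, and with ``limit_dist='auto'`` the upper distance bound is read off the score function's first axis when available.]

    import navis
    from navis.nbl.nblast_funcs import NBlaster

    dps = [navis.make_dotprops(n, k=5) for n in navis.example_neurons(2)]
    blaster = NBlaster(smat='auto', limit_dist='auto')
    q_idx = blaster.append(dps[0])   # -> 0: positional index, not the neuron's id
    t_idx = blaster.append(dps[1])   # -> 1
    forward = blaster.single_query_target(q_idx, t_idx, scores='forward')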
From e3ccc4d937117186f8774a9cae59221aa37d33ca Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 7 Dec 2021 11:12:15 +0000 Subject: [PATCH 19/21] Add SimpleLookup for non-float cases --- navis/nbl/nblast_funcs.py | 2 +- navis/nbl/smat.py | 85 +++++++++++++++++++++++++++++++------ tests/test_nbl/test_smat.py | 2 +- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 8fc24384..9bf1ffe1 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -98,7 +98,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', if limit_dist == "auto": try: - self.distance_upper_bound = self.score_fn.digitizers[0]._max + self.distance_upper_bound = self.score_fn.axes[0]._max except AttributeError: logger.warning("Could not infer distance upper bound from scoring function") self.distance_upper_bound = None diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 2c6f07e6..8aa9ee8c 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -1,10 +1,12 @@ from __future__ import annotations +from abc import ABC, abstractmethod from itertools import permutations import sys import os from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import ( + Generic, Hashable, Iterator, Mapping, @@ -15,6 +17,7 @@ Iterable, Any, Tuple, + TypeVar, Union, ) import logging @@ -443,14 +446,70 @@ def parse_boundary(item: str): return tuple(float(i) for i in item[1:-1].split(",")), right -class Digitizer: +T = TypeVar("T") + + +class LookupAxis(ABC, Generic[T]): + """Class converting some data into a linear index.""" + + @abstractmethod + def __len__(self) -> int: + """Number of bins represented by this instance.""" + pass + + @abstractmethod + def __call__(self, value: Union[T, Sequence[T]]) -> Union[int, Sequence[int]]: + """Convert some data into a linear index. + + Parameters + ---------- + value : Union[T, Sequence[T]] + Value to convert into an index + + Returns + ------- + Union[int, Sequence[int]] + If a scalar was given, return a scalar; otherwise, a numpy array of ints. + """ + pass + + +class SimpleLookup(LookupAxis[Hashable]): + def __init__(self, items: List[Hashable]): + """Look up in a list of items and return their index. + + Parameters + ---------- + items : List[Hashable] + The item's position in the list is the index which will be returned. + + Raises + ------ + ValueError + items are non-unique. + """ + self.items = {item: idx for idx, item in enumerate(items)} + if len(self.items) != len(items): + raise ValueError("Items are not unique") + + def __len__(self) -> int: + return len(self.items) + + def __call__(self, value: Union[Hashable, Sequence[Hashable]]) -> Union[int, Sequence[int]]: + if np.isscalar(value): + return self.items[value] + else: + return np.array([self.items[v] for v in value], int) + + +class Digitizer(LookupAxis[float]): def __init__( self, boundaries: Sequence[float], clip: Tuple[bool, bool] = (True, True), right=False, ): - """Class converting continuous values into discrete indices given specific bin boundaries. + """Class converting continuous values into discrete indices. 
Parameters ---------- @@ -637,29 +696,29 @@ def __eq__(self, other: object) -> bool: class LookupNd: - def __init__(self, digitizers: List[Digitizer], cells: np.ndarray): - if [len(b) for b in digitizers] != list(cells.shape): + def __init__(self, axes: List[LookupAxis], cells: np.ndarray): + if [len(b) for b in axes] != list(cells.shape): raise ValueError("boundaries and cells have inconsistent bin counts") - self.digitizers = digitizers + self.axes = axes self.cells = cells def __call__(self, *args): - if len(args) != len(self.digitizers): + if len(args) != len(self.axes): raise TypeError( - f"Lookup takes {len(self.digitizers)} arguments but {len(args)} were given" + f"Lookup takes {len(self.axes)} arguments but {len(args)} were given" ) - idxs = tuple(d(arg) for d, arg in zip(self.digitizers, args)) + idxs = tuple(d(arg) for d, arg in zip(self.axes, args)) out = self.cells[idxs] return out class Lookup2d(LookupNd): - """Convenience class inheriting from LookupNd for the common 2D case. + """Convenience class inheriting from LookupNd for the common 2D float case. Provides IO with pandas DataFrames. """ - def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarray): + def __init__(self, axis0: Digitizer, axis1: Digitizer, cells: np.ndarray): """2D lookup table for convert NBLAST matches to scores. Commonly read from a ``pandas.DataFrame`` @@ -674,7 +733,7 @@ def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarr cells : np.ndarray Values to look up in the table. """ - super().__init__([digitizer0, digitizer1], cells) + super().__init__([axis0, axis1], cells) def to_dataframe(self) -> pd.DataFrame: """Convert the lookup table into a ``pandas.DataFrame``. @@ -689,8 +748,8 @@ def to_dataframe(self) -> pd.DataFrame: """ return pd.DataFrame( self.cells, - self.digitizers[0].to_strings(), - self.digitizers[1].to_strings(), + self.axes[0].to_strings(), + self.axes[1].to_strings(), ) @classmethod diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 76d7f9fb..04d2d4f6 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -87,7 +87,7 @@ def test_lookup2d_roundtrip(): df = lookup.to_dataframe() lookup2 = Lookup2d.from_dataframe(df) assert np.allclose(lookup.cells, lookup2.cells) - for b1, b2 in zip(lookup.digitizers, lookup2.digitizers): + for b1, b2 in zip(lookup.axes, lookup2.axes): assert b1 == b2 From 62b80943f38fdd236443e9d0ce457e4e818eff74 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 9 Feb 2022 13:10:42 +0000 Subject: [PATCH 20/21] WIP units --- navis/config.py | 5 +- navis/nbl/nblast_funcs.py | 29 +++++++++--- navis/nbl/smat.py | 83 ++++++++++++++++++++++++++++----- navis/units.py | 97 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 194 insertions(+), 20 deletions(-) create mode 100644 navis/units.py diff --git a/navis/config.py b/navis/config.py index dd380d17..8b802906 100644 --- a/navis/config.py +++ b/navis/config.py @@ -17,6 +17,8 @@ import matplotlib as mpl +from .units import ureg # noqa: F401 + logger = logging.getLogger('navis') @@ -69,9 +71,6 @@ def remove_log_handlers(): # Default color for neurons default_color = (.95, .65, .04) -# Unit registry -ureg = pint.UnitRegistry() - # Set to true to prevent Viewer from ever showing headless = os.environ.get('NAVIS_HEADLESS', 'False').lower() == 'true' if headless: diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 9bf1ffe1..11cde829 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py 
@@ -22,14 +22,18 @@ import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional, List +from typing import Union, Optional from typing_extensions import Literal +import pint + +from pint.quantity import Quantity from navis.nbl.smat import Lookup2d, smat_fcwb from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops from .base import Blaster, NestedIndices +from ..units import as_unit __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -105,7 +109,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops) -> NestedIndices: + def append(self, dotprops, ignore_units=False) -> NestedIndices: """Append dotprops. Returns the numerical index appended dotprops. @@ -113,15 +117,28 @@ def append(self, dotprops) -> NestedIndices: return a (possibly nested) list of indices. """ if isinstance(dotprops, Dotprops): - return self._append_dotprops(dotprops) + return self._append_dotprops(dotprops, ignore_units) try: - return [self.append(n) for n in dotprops] + return [self.append(n, ignore_units) for n in dotprops] except TypeError: # i.e. not iterable raise ValueError(f"Expected Dotprops or iterable thereof; got {type(dotprops)}") - def _append_dotprops(self, dotprops: Dotprops) -> int: - next_id = len(self) + def _append_dotprops(self, dotprops: Dotprops, ignore_units) -> int: + if not ignore_units: + # if isinstance(dotprops.units, pint.Quantity): + # if np.allclose(1, dotprops.units): + # units = dotprops.units.units + # else: + # logger.warning( + # "Dotprops coordinates are not unitary (%s). " + # "This might lead to unexpected results.", + # dotprops.units + # ) + # elif dotprops: + # units = as_unit(dotprops.units) + + # if as_unit(dotprops.units) self.neurons.append(dotprops) self.ids.append(dotprops.id) # Calculate score for self hit diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 8aa9ee8c..a18fc8ed 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -28,14 +28,18 @@ import math from collections import defaultdict +import pint import numpy as np import pandas as pd +from ..config import ureg from ..core.neurons import Dotprops +from ..units import parse_quantity, reduce_units, as_unit, DIMENSIONLESS logger = logging.getLogger(__name__) DEFAULT_SEED = 1991 +DIMENSIONLESS = pint.Unit("dimensionless") epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) @@ -433,7 +437,23 @@ def is_monotonically_increasing(lst): return True -def parse_boundary(item: str): +def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: + """Parse a string representing a half-open interval. 
+ + Parameters + ---------- + item : str + An interval formatted like ``[1 nm, inf nm)`` + + Returns + ------- + Tuple[Tuple[float, float], bool, pint.Unit] + ( + (lower_bound, upper_bound), + include_right, + units + ) + """ explicit_interval = item[0] + item[-1] if explicit_interval == "[)": right = False @@ -443,7 +463,18 @@ def parse_boundary(item: str): raise ValueError( f"Enclosing characters '{explicit_interval}' do not match a half-open interval" ) - return tuple(float(i) for i in item[1:-1].split(",")), right + + out = [] + units = [] + for s in item[1:-1].split(","): + q = parse_quantity(s) + out.append(q.magnitude) + units.append(q.units) + + if len(out) != 2: + raise ValueError(f"Could not parse interval, got {len(out)} values instead of 2") + + return tuple(out), right, reduce_units(units) T = TypeVar("T") @@ -452,6 +483,8 @@ def parse_boundary(item: str): class LookupAxis(ABC, Generic[T]): """Class converting some data into a linear index.""" + units: pint.Unit = DIMENSIONLESS + @abstractmethod def __len__(self) -> int: """Number of bins represented by this instance.""" @@ -473,6 +506,13 @@ def __call__(self, value: Union[T, Sequence[T]]) -> Union[int, Sequence[int]]: """ pass + def same_units(self, unit: Union[str, pint.Unit]) -> bool: + try: + reduce_units([self.units, as_unit(unit)]) + except ValueError: + return False + return True + class SimpleLookup(LookupAxis[Hashable]): def __init__(self, items: List[Hashable]): @@ -508,6 +548,8 @@ def __init__( boundaries: Sequence[float], clip: Tuple[bool, bool] = (True, True), right=False, + *, + units=DIMENSIONLESS, ): """Class converting continuous values into discrete indices. @@ -546,6 +588,7 @@ def __init__( raise ValueError("Boundaries are not monotonically increasing") self.boundaries = np.asarray(boundaries) + self.units = as_unit(units) def __len__(self): return len(self.boundaries) - 1 @@ -570,8 +613,14 @@ def to_strings(self) -> List[str]: b = self.boundaries.copy() b[0] = self._min b[-1] = self._max + + if self.units == DIMENSIONLESS: + unit = "" + else: + unit = " " + str(self.units) + return [ - f"{lb}{lower},{upper}{rb}" + f"{lb}{lower}{unit},{upper}{unit}{rb}" for lower, upper in zip(b[:-1], b[1:]) ] @@ -596,8 +645,10 @@ def from_strings(cls, interval_strs: Sequence[str]): bounds: List[float] = [] last_upper = None last_right = None + units = [] for item in interval_strs: - (lower, upper), right = parse_boundary(item) + (lower, upper), right, unit = parse_interval(item) + units.append(unit) bounds.append(float(lower)) if last_right is not None: @@ -613,10 +664,10 @@ def from_strings(cls, interval_strs: Sequence[str]): last_upper = upper bounds.append(float(last_upper)) - return cls(bounds, right=last_right) + return cls(bounds, right=last_right, units=reduce_units(units)) @classmethod - def from_linear(cls, lower: float, upper: float, nbins: int, right=False): + def from_linear(cls, lower: float, upper: float, nbins: int, right=False, *, units=DIMENSIONLESS): """Choose digitizer boundaries spaced linearly between two values. Input values will be clipped to fit within the given interval. 
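[Aside: a hedged illustration of the interval strings these docstrings describe; exact float formatting may differ. It mirrors the round-trip exercised by test_lookup2d_roundtrip in the test suite.]

    from navis.nbl.smat import Digitizer

    dig = Digitizer.from_linear(0, 1, 5)  # 5 bins spanning [0, 1]
    dig.to_strings()  # e.g. ['[0.0,0.2)', '[0.2,0.4)', '[0.4,0.6)', '[0.6,0.8)', '[0.8,1.0)']
    # The serialised form parses back to an equivalent Digitizer:
    assert Digitizer.from_strings(dig.to_strings()) == dig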
@@ -638,10 +689,10 @@ def from_linear(cls, lower: float, upper: float, nbins: int, right=False):
         Digitizer
         """
         arr = np.linspace(lower, upper, nbins + 1, endpoint=True)
-        return cls(arr, right=right)
+        return cls(arr, right=right, units=units)
 
     @classmethod
-    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False):
+    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False, *, units=DIMENSIONLESS):
         """Choose digitizer boundaries in a geometric sequence.
 
         Additional bins will be added above and below the given values.
@@ -664,7 +715,7 @@ def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=
             Digitizer
         """
         arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True)
-        return cls(arr, clip=(False, False), right=right)
+        return cls(arr, clip=(False, False), right=right, units=units)
 
     @classmethod
     def from_data(cls, data: Sequence[float], nbins: int, right=False):
@@ -684,8 +735,13 @@ def from_data(cls, data: Sequence[float], nbins: int, right=False):
         -------
         Digitizer
         """
+        if isinstance(data, pint.Quantity):
+            units = data.units
+            data = data.magnitude
+        else:
+            units = DIMENSIONLESS
         arr = np.quantile(data, np.linspace(0, 1, nbins + 1, True))
-        return cls(arr, right=right)
+        return cls(arr, right=right, units=units)
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, Digitizer):
@@ -712,6 +768,10 @@ def __call__(self, *args):
         out = self.cells[idxs]
         return out
 
+    @property
+    def units(self):
+        return tuple(ax.units for ax in self.axes)
+
 
 class Lookup2d(LookupNd):
     """Convenience class inheriting from LookupNd for the common 2D float case.
@@ -776,7 +836,9 @@ def _smat_fcwb(alpha=False):
     fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha]
     fpath = smat_path / fname
 
-    return Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
+    lookup = Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
+    lookup.axes[0].units = ureg.Unit("micrometer")
+    return lookup
 
 
 def smat_fcwb(alpha=False):
diff --git a/navis/units.py b/navis/units.py
new file mode 100644
index 00000000..85a39535
--- /dev/null
+++ b/navis/units.py
@@ -0,0 +1,97 @@
+import re
+from typing import Optional, Union, Sequence
+
+import numpy as np
+import pint
+
+DIMENSIONLESS = pint.Unit("dimensionless")
+
+INF_RE = re.compile(r"(?P<is_neg>-?)inf(inity)?")
+
+ureg = pint.UnitRegistry()
+ureg.define("@alias micron = micrometer")
+
+
+def parse_quantity(item: str) -> pint.Quantity:
+    """Parse strings into ``pint.Quantity``, accounting for infinity.
+
+    Parameters
+    ----------
+    item : str
+        A quantity string like those used by ``pint``.
+
+    Returns
+    -------
+    pint.Quantity
+    """
+    item = item.strip()
+    try:
+        q = ureg.Quantity(item)
+    except pint.UndefinedUnitError as e:
+        first, *other = item.split()
+        m = INF_RE.match(first)
+        if m is None:
+            raise e
+
+        val = float("inf")
+        if m.groupdict()["is_neg"]:
+            val *= -1
+        unit = ureg.Unit(" ".join(other))
+        q = ureg.Quantity(val, unit)
+
+    return q
+
+
+def as_unit(unit: Optional[Union[str, pint.Unit]]) -> pint.Unit:
+    """Convert a string (or None) into a ``pint.Unit``.
+
+    Parameters
+    ----------
+    unit : Optional[Union[str, pint.Unit]]
+
+    Returns
+    -------
+    pint.Unit
+        If the ``unit`` argument was ``None``, return dimensionless.
+ """ + if unit is None: + return DIMENSIONLESS + + if isinstance(unit, pint.Unit): + return unit + + return ureg.Unit(unit) + + +def reduce_units(units: Sequence[Optional[Union[str, pint.Unit]]]) -> pint.Unit: + """Reduce a sequence of units or unit-like strings down to a single ``pint.Unit``. + + Dimensionless units are ignored. + + Parameters + ---------- + units : Sequence[Optional[Union[str, pint.Unit]]] + ``None`` is treated as dimensionless. + + Returns + ------- + pint.Unit + Consensus units of the sequence. + + Raises + ------ + ValueError + If more than one non-dimensionless unit is found. + """ + # use np.unique instead of set operations here, + # because setting aliases in the registry affects + # __eq__ (comparisons as used by np.unique) but not + # __hash__ (as used by sets) + unit_set = np.unique([DIMENSIONLESS] + [as_unit(u1) for u1 in units]) + if len(unit_set) == 1: + return DIMENSIONLESS + actuals = list(unit_set) + actuals.remove(DIMENSIONLESS) + if len(actuals) == 1: + return actuals[0] + raise ValueError(f"More than one real unit found: {sorted(unit_set)}") From dfc6b4bf686e087cddbc397f5ab86c288220c9ca Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 11 Apr 2022 13:39:58 +0100 Subject: [PATCH 21/21] Revert "WIP units" This reverts commit 62b80943f38fdd236443e9d0ce457e4e818eff74. --- navis/config.py | 5 +- navis/nbl/nblast_funcs.py | 29 +++--------- navis/nbl/smat.py | 83 +++++---------------------------- navis/units.py | 97 --------------------------------------- 4 files changed, 20 insertions(+), 194 deletions(-) delete mode 100644 navis/units.py diff --git a/navis/config.py b/navis/config.py index 8b802906..dd380d17 100644 --- a/navis/config.py +++ b/navis/config.py @@ -17,8 +17,6 @@ import matplotlib as mpl -from .units import ureg # noqa: F401 - logger = logging.getLogger('navis') @@ -71,6 +69,9 @@ def remove_log_handlers(): # Default color for neurons default_color = (.95, .65, .04) +# Unit registry +ureg = pint.UnitRegistry() + # Set to true to prevent Viewer from ever showing headless = os.environ.get('NAVIS_HEADLESS', 'False').lower() == 'true' if headless: diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 11cde829..9bf1ffe1 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -22,18 +22,14 @@ import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional +from typing import Union, Optional, List from typing_extensions import Literal -import pint - -from pint.quantity import Quantity from navis.nbl.smat import Lookup2d, smat_fcwb from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops from .base import Blaster, NestedIndices -from ..units import as_unit __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -109,7 +105,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops, ignore_units=False) -> NestedIndices: + def append(self, dotprops) -> NestedIndices: """Append dotprops. Returns the numerical index appended dotprops. @@ -117,28 +113,15 @@ def append(self, dotprops, ignore_units=False) -> NestedIndices: return a (possibly nested) list of indices. """ if isinstance(dotprops, Dotprops): - return self._append_dotprops(dotprops, ignore_units) + return self._append_dotprops(dotprops) try: - return [self.append(n, ignore_units) for n in dotprops] + return [self.append(n) for n in dotprops] except TypeError: # i.e. 
not iterable raise ValueError(f"Expected Dotprops or iterable thereof; got {type(dotprops)}") - def _append_dotprops(self, dotprops: Dotprops, ignore_units) -> int: - if not ignore_units: - # if isinstance(dotprops.units, pint.Quantity): - # if np.allclose(1, dotprops.units): - # units = dotprops.units.units - # else: - # logger.warning( - # "Dotprops coordinates are not unitary (%s). " - # "This might lead to unexpected results.", - # dotprops.units - # ) - # elif dotprops: - # units = as_unit(dotprops.units) - - # if as_unit(dotprops.units) + def _append_dotprops(self, dotprops: Dotprops) -> int: + next_id = len(self) self.neurons.append(dotprops) self.ids.append(dotprops.id) # Calculate score for self hit diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index a18fc8ed..8aa9ee8c 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -28,18 +28,14 @@ import math from collections import defaultdict -import pint import numpy as np import pandas as pd -from ..config import ureg from ..core.neurons import Dotprops -from ..units import parse_quantity, reduce_units, as_unit, DIMENSIONLESS logger = logging.getLogger(__name__) DEFAULT_SEED = 1991 -DIMENSIONLESS = pint.Unit("dimensionless") epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) @@ -437,23 +433,7 @@ def is_monotonically_increasing(lst): return True -def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: - """Parse a string representing a half-open interval. - - Parameters - ---------- - item : str - An interval formatted like ``[1 nm, inf nm)`` - - Returns - ------- - Tuple[Tuple[float, float], bool, pint.Unit] - ( - (lower_bound, upper_bound), - include_right, - units - ) - """ +def parse_boundary(item: str): explicit_interval = item[0] + item[-1] if explicit_interval == "[)": right = False @@ -463,18 +443,7 @@ def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: raise ValueError( f"Enclosing characters '{explicit_interval}' do not match a half-open interval" ) - - out = [] - units = [] - for s in item[1:-1].split(","): - q = parse_quantity(s) - out.append(q.magnitude) - units.append(q.units) - - if len(out) != 2: - raise ValueError(f"Could not parse interval, got {len(out)} values instead of 2") - - return tuple(out), right, reduce_units(units) + return tuple(float(i) for i in item[1:-1].split(",")), right T = TypeVar("T") @@ -483,8 +452,6 @@ def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: class LookupAxis(ABC, Generic[T]): """Class converting some data into a linear index.""" - units: pint.Unit = DIMENSIONLESS - @abstractmethod def __len__(self) -> int: """Number of bins represented by this instance.""" @@ -506,13 +473,6 @@ def __call__(self, value: Union[T, Sequence[T]]) -> Union[int, Sequence[int]]: """ pass - def same_units(self, unit: Union[str, pint.Unit]) -> bool: - try: - reduce_units([self.units, as_unit(unit)]) - except ValueError: - return False - return True - class SimpleLookup(LookupAxis[Hashable]): def __init__(self, items: List[Hashable]): @@ -548,8 +508,6 @@ def __init__( boundaries: Sequence[float], clip: Tuple[bool, bool] = (True, True), right=False, - *, - units=DIMENSIONLESS, ): """Class converting continuous values into discrete indices. 
@@ -588,7 +546,6 @@ def __init__(
             raise ValueError("Boundaries are not monotonically increasing")
 
         self.boundaries = np.asarray(boundaries)
-        self.units = as_unit(units)
 
     def __len__(self):
         return len(self.boundaries) - 1
@@ -613,14 +570,8 @@ def to_strings(self) -> List[str]:
         b = self.boundaries.copy()
         b[0] = self._min
         b[-1] = self._max
-
-        if self.units == DIMENSIONLESS:
-            unit = ""
-        else:
-            unit = " " + str(self.units)
-
         return [
-            f"{lb}{lower}{unit},{upper}{unit}{rb}"
+            f"{lb}{lower},{upper}{rb}"
            for lower, upper in zip(b[:-1], b[1:])
         ]
 
@@ -645,10 +596,8 @@ def from_strings(cls, interval_strs: Sequence[str]):
         bounds: List[float] = []
         last_upper = None
         last_right = None
-        units = []
         for item in interval_strs:
-            (lower, upper), right, unit = parse_interval(item)
-            units.append(unit)
+            (lower, upper), right = parse_boundary(item)
             bounds.append(float(lower))
 
             if last_right is not None:
@@ -664,10 +613,10 @@ def from_strings(cls, interval_strs: Sequence[str]):
             last_upper = upper
 
         bounds.append(float(last_upper))
-        return cls(bounds, right=last_right, units=reduce_units(units))
+        return cls(bounds, right=last_right)
 
     @classmethod
-    def from_linear(cls, lower: float, upper: float, nbins: int, right=False, *, units=DIMENSIONLESS):
+    def from_linear(cls, lower: float, upper: float, nbins: int, right=False):
         """Choose digitizer boundaries spaced linearly between two values.
 
         Input values will be clipped to fit within the given interval.
@@ -689,10 +638,10 @@ def from_linear(cls, lower: float, upper: float, nbins: int, right=False, *, uni
             Digitizer
         """
         arr = np.linspace(lower, upper, nbins + 1, endpoint=True)
-        return cls(arr, right=right, units=units)
+        return cls(arr, right=right)
 
     @classmethod
-    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False, *, units=DIMENSIONLESS):
+    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False):
         """Choose digitizer boundaries in a geometric sequence.
 
         Additional bins will be added above and below the given values.
@@ -715,7 +664,7 @@ def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=
             Digitizer
         """
         arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True)
-        return cls(arr, clip=(False, False), right=right, units=units)
+        return cls(arr, clip=(False, False), right=right)
 
     @classmethod
     def from_data(cls, data: Sequence[float], nbins: int, right=False):
@@ -735,13 +684,8 @@ def from_data(cls, data: Sequence[float], nbins: int, right=False):
         -------
         Digitizer
         """
-        if isinstance(data, pint.Quantity):
-            units = data.units
-            data = data.magnitude
-        else:
-            units = DIMENSIONLESS
         arr = np.quantile(data, np.linspace(0, 1, nbins + 1, True))
-        return cls(arr, right=right, units=units)
+        return cls(arr, right=right)
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, Digitizer):
@@ -768,10 +712,6 @@ def __call__(self, *args):
         out = self.cells[idxs]
         return out
 
-    @property
-    def units(self):
-        return tuple(ax.units for ax in self.axes)
-
 
 class Lookup2d(LookupNd):
     """Convenience class inheriting from LookupNd for the common 2D float case.
@@ -836,9 +776,7 @@ def _smat_fcwb(alpha=False):
     fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha]
     fpath = smat_path / fname
 
-    lookup = Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
-    lookup.axes[0].units = ureg.Unit("micrometer")
-    return lookup
+    return Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
 
 
 def smat_fcwb(alpha=False):
diff --git a/navis/units.py b/navis/units.py
deleted file mode 100644
index 85a39535..00000000
--- a/navis/units.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import re
-from typing import Optional, Union, Sequence
-
-import numpy as np
-import pint
-
-DIMENSIONLESS = pint.Unit("dimensionless")
-
-INF_RE = re.compile(r"(?P<is_neg>-?)inf(inity)?")
-
-ureg = pint.UnitRegistry()
-ureg.define("@alias micron = micrometer")
-
-
-def parse_quantity(item: str) -> pint.Quantity:
-    """Parse strings into ``pint.Quantity``, accounting for infinity.
-
-    Parameters
-    ----------
-    item : str
-        A quantity string like those used by ``pint``.
-
-    Returns
-    -------
-    pint.Quantity
-    """
-    item = item.strip()
-    try:
-        q = ureg.Quantity(item)
-    except pint.UndefinedUnitError as e:
-        first, *other = item.split()
-        m = INF_RE.match(first)
-        if m is None:
-            raise e
-
-        val = float("inf")
-        if m.groupdict()["is_neg"]:
-            val *= -1
-        unit = ureg.Unit(" ".join(other))
-        q = ureg.Quantity(val, unit)
-
-    return q
-
-
-def as_unit(unit: Optional[Union[str, pint.Unit]]) -> pint.Unit:
-    """Convert a string (or None) into a ``pint.Unit``.
-
-    Parameters
-    ----------
-    unit : Optional[Union[str, pint.Unit]]
-
-    Returns
-    -------
-    pint.Unit
-        If the ``unit`` argument was ``None``, return dimensionless.
-    """
-    if unit is None:
-        return DIMENSIONLESS
-
-    if isinstance(unit, pint.Unit):
-        return unit
-
-    return ureg.Unit(unit)
-
-
-def reduce_units(units: Sequence[Optional[Union[str, pint.Unit]]]) -> pint.Unit:
-    """Reduce a sequence of units or unit-like strings down to a single ``pint.Unit``.
-
-    Dimensionless units are ignored.
-
-    Parameters
-    ----------
-    units : Sequence[Optional[Union[str, pint.Unit]]]
-        ``None`` is treated as dimensionless.
-
-    Returns
-    -------
-    pint.Unit
-        Consensus units of the sequence.
-
-    Raises
-    ------
-    ValueError
-        If more than one non-dimensionless unit is found.
-    """
-    # use np.unique instead of set operations here,
-    # because setting aliases in the registry affects
-    # __eq__ (comparisons as used by np.unique) but not
-    # __hash__ (as used by sets)
-    unit_set = np.unique([DIMENSIONLESS] + [as_unit(u1) for u1 in units])
-    if len(unit_set) == 1:
-        return DIMENSIONLESS
-    actuals = list(unit_set)
-    actuals.remove(DIMENSIONLESS)
-    if len(actuals) == 1:
-        return actuals[0]
-    raise ValueError(f"More than one real unit found: {sorted(unit_set)}")
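
For reference when revisiting this revert: below is a rough sketch of how the deleted `navis/units.py` helpers were intended to behave. It is hypothetical usage rather than live navis API — the module no longer exists after this patch — and the import assumes the definitions exactly as shown in the deleted file above.

```python
# Hypothetical usage of the deleted navis/units.py helpers; assumes the
# file above were still importable as navis.units (it is not, post-revert).
from navis.units import parse_quantity, as_unit, reduce_units

# In the pint versions this patch targeted, "inf" is not parseable as a
# magnitude (it raises UndefinedUnitError), so parse_quantity falls back
# to INF_RE and builds the Quantity by hand, using the "is_neg" group
# to pick the sign.
q = parse_quantity("-inf nm")
assert q.magnitude == float("-inf")
assert str(q.units) == "nanometer"

# as_unit normalises None to the dimensionless unit.
assert str(as_unit(None)) == "dimensionless"

# reduce_units picks the consensus unit of a sequence: None and
# dimensionless entries are ignored and a single real unit wins...
assert str(reduce_units([None, "micrometer", "dimensionless"])) == "micrometer"

# ...while two conflicting real units are intended to raise ValueError.
try:
    reduce_units(["nanometer", "micrometer"])
except ValueError as e:
    print(e)  # More than one real unit found: ...
```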
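The unit-aware interval strings from the reverted WIP `Digitizer` round-trip in a similar way. Again, this is a sketch against the now-deleted definitions: the `units` keyword and `parse_interval` only existed in the reverted commit, and the exact `to_strings()` rendering is inferred from the f-string in that patch.

```python
# Hypothetical round trip through the reverted, unit-aware Digitizer;
# assumes the WIP definitions of Digitizer and parse_interval above.
from navis.nbl.smat import Digitizer, parse_interval

dig = Digitizer([1.0, 5.0, 25.0], clip=(False, False), units="nm")
print(dig.to_strings())
# expected, per the f-string in the reverted to_strings():
# ['[1.0 nanometer,5.0 nanometer)', '[5.0 nanometer,25.0 nanometer)']

# parse_interval() splits on the comma, parses each side with
# parse_quantity(), and reduces the two units to a consensus.
bounds, right, unit = parse_interval("[1.0 nanometer,5.0 nanometer)")
assert bounds == (1.0, 5.0)
assert right is False           # "[...)" is closed on the left
assert str(unit) == "nanometer"
```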
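One design note worth keeping from the deleted module: `reduce_units` deduplicated with `np.unique` rather than a `set` because, per the comment in that file, registering an alias changes how pint units compare but not how they hash. A minimal pint-only sketch of that pitfall, independent of navis, with the expected outputs taken from that comment rather than verified against any particular pint release:

```python
import pint

ureg = pint.UnitRegistry()
ureg.define("@alias micron = micrometer")

um = ureg.Unit("micrometer")
micron = ureg.Unit("micron")

# Per the deleted module's comment: __eq__ sees through the alias, so
# equality-based deduplication (e.g. np.unique) collapses the two...
print(um == micron)
# ...but __hash__ may not follow suit, so a set can keep "duplicates".
print(len({um, micron}))
```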