From c4b123d1055a1916fcfa87e8b1ca1f128cad2704 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 14:59:52 +0000 Subject: [PATCH 01/21] conftest: minor fixture changes --- tests/conftest.py | 53 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index a35954d7..ec872834 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ from pathlib import Path import os +from typing import List import pandas as pd import pytest @@ -9,7 +10,7 @@ import navis -@pytest.fixture +@pytest.fixture(scope="session") def data_dir(): return Path(__file__).resolve().parent.parent / "navis" / "data" @@ -17,8 +18,8 @@ def data_dir(): @pytest.fixture( params=["Path", "pathstr", "swcstr", "textbuffer", "rawbuffer", "DataFrame"] ) -def swc_source(request, data_dir: Path): - swc_path: Path = data_dir / "swc" / "722817260.swc" +def swc_source(request, swc_paths: List[Path]): + swc_path: Path = swc_paths[0] if request.param == "Path": yield swc_path elif request.param == "pathstr": @@ -42,9 +43,9 @@ def swc_source(request, data_dir: Path): @pytest.fixture( params=["dirstr", "dirpath", "list", "listwithdir"], ) -def swc_source_multi(request, data_dir: Path): - dpath = data_dir / "swc" - fpath = dpath / "722817260.swc" +def swc_source_multi(request, swc_paths: List[Path]): + fpath = swc_paths[0] + dpath = fpath.parent if request.param == "dirstr": yield str(dpath) elif request.param == "dirpath": @@ -74,3 +75,43 @@ def voxel_nrrd_path(tmp_path): nrrd.write(os.fspath(path), data, header) return path + + +def data_paths(dpath, glob="*"): + return sorted(dpath.glob(glob)) + + +@pytest.fixture(scope="session") +def swc_paths(data_dir: Path): + return data_paths(data_dir / "swc", "*.swc") + + +@pytest.fixture(scope="session") +def gml_paths(data_dir: Path): + return data_paths(data_dir / "gml", "*.gml") + + +@pytest.fixture(scope="session") +def obj_paths(data_dir: Path): + return data_paths(data_dir / "obj", "*.obj") + + +@pytest.fixture(scope="session") +def synapses_paths(data_dir: Path): + return data_paths(data_dir / "synapses", "*.csv") + + +@pytest.fixture(scope="session") +def volumes_paths(data_dir: Path): + return data_paths(data_dir / "volumes", "*.obj") + + +@pytest.fixture +def treeneuron_dfs(swc_paths, synapses_paths): + swc_reader = navis.io.swc_io.SwcReader() + out = [] + for swc_path, syn_path in zip(swc_paths, synapses_paths): + neuron = swc_reader.read_file_path(swc_path) + neuron.connectors = pd.read_csv(syn_path) + out.append(neuron) + return out From 30abb72c96f97d69972e81d9ebfe65b3c7205d7b Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 15:00:15 +0000 Subject: [PATCH 02/21] Dotprops.__len__ --- navis/core/dotprop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/navis/core/dotprop.py b/navis/core/dotprop.py index c83b04f9..c4e5c64e 100644 --- a/navis/core/dotprop.py +++ b/navis/core/dotprop.py @@ -622,3 +622,6 @@ def to_skeleton(self, tn._soma = self._soma return tn + + def __len__(self): + return len(self.points) From 35281046df2360fb0765c39c0bf07d52a8463ab2 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 15:00:37 +0000 Subject: [PATCH 03/21] Initial implementation of smat builder --- navis/nbl/smat.py | 539 ++++++++++++++++++++++++++++++++++++ tests/test_nbl/__init__.py | 0 tests/test_nbl/test_smat.py | 161 +++++++++++ 3 files changed, 700 insertions(+) create mode 100644 navis/nbl/smat.py create mode 100644 tests/test_nbl/__init__.py 
create mode 100644 tests/test_nbl/test_smat.py diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py new file mode 100644 index 00000000..01dc6f2a --- /dev/null +++ b/navis/nbl/smat.py @@ -0,0 +1,539 @@ +from __future__ import annotations +from itertools import permutations +import sys +import os +from collections import Counter +from concurrent.futures import ProcessPoolExecutor +from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +import logging +from pathlib import Path +from functools import lru_cache +from copy import deepcopy +import operator + +import numpy as np +import pandas as pd + +from ..core.neurons import Dotprops + +logger = logging.getLogger(__name__) + +DEFAULT_SEED = 1991 + +epsilon = sys.float_info.epsilon +cpu_count = max(1, (os.cpu_count() or 2) - 1) +IMPLICIT_INTERVAL = "[)" + +fp = Path(__file__).resolve().parent +smat_path = fp / 'score_mats' + + +def chunksize(it_len, cpu_count, min_chunk=50): + return max(min_chunk, int(it_len / (cpu_count * 4))) + + +def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): + """Ensures that boundaries cover -inf to inf. + If the left or rightmost values lie within the interval ``(left, right)``, + ``-inf`` or ``inf`` will be prepended or appended respectively. + Otherwise, the left and right values will be set to + ``-inf`` and ``inf`` respectively. + Parameters + ---------- + arr : array-like of floats + Array of boundaries + left : float, optional + If the first value is greater than this, prepend -inf; + otherwise replace that value with -inf. + Defaults to -inf if None. + right : float, optional + If the last value is less than this, append inf; + otherwise replace that value with inf. + Defaults to inf if None. + Returns + ------- + np.ndarray of floats + Boundaries from -inf to inf + Raises + ------ + ValueError + If values of ``arr`` are not monotonically increasing. + """ + if left is None: + left = -np.inf + if right is None: + right = np.inf + + if not np.all(np.diff(arr) > 0): + raise ValueError("Boundaries are not monotonically increasing") + + items = list(arr) + if items[0] <= left: + items[0] = -np.inf + else: + items.insert(0, -np.inf) + + if items[-1] >= right: + items[-1] = np.inf + else: + items.append(np.inf) + + return np.array(items) + + +def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any]]: + for a, b in pairs: + if a != b: + yield a, b + + +class LookupNdBuilder: + def __init__( + self, + dotprops, + matching_sets, + boundaries: Sequence[Sequence[float]], + match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], + nonmatching=None, + seed=DEFAULT_SEED, + ) -> None: + f"""Class for building an N-dimensional score lookup for NBLAST. + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + boundaries : sequence of array-likes of floats + List of lists, where the inner lists are monotonically increasing + from -inf to inf. + The length of the outer list is the dimensionality of the lookup table. + The inner lists are the boundaries of bins for that dimension, + i.e. an inner list of N items describes N-1 bins. + If an inner list is not inf-bounded, + -inf and inf will be prepended and appended. + See the ``wrap_bounds`` convenience function. 
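+            For example, ``[[0, 1, 2], [0.5]]`` would be wrapped to
+            ``[[-inf, 0, 1, 2, inf], [-inf, 0.5, inf]]``,
+            describing a 4x2 grid of bins.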
+ match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] + Function taking 2 arguments, + both instances of ``navis.core.neurons.Dotprops``, + and returning a list of 1D ``numpy.ndarray``s of floats. + The length of the list must be the same as the length of ``boundaries``. + The length of the ``array``s must be the same + as the number of points in the first argument. + This function returns values describing the quality of + point matches from a query to a target neuron. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used + (on the assumption that matches are a small subset of possible pairs). + seed : int, optional + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} + """ + self.dotprops = dotprops + self.matching_sets = matching_sets + self._nonmatching = nonmatching + self.match_fn = match_fn + self.boundaries = [wrap_bounds(b) for b in boundaries] + + self.rng = np.random.default_rng(seed) + + def _dotprop_keys(self): + try: + return self.dotprops.keys() + except AttributeError: + return range(len(self.dotprops)) + + @property + def nonmatching(self): + """Indices of nonmatching set of neurons""" + if self._nonmatching is None: + return set(self._dotprop_keys()) + return self._nonmatching + + def _yield_matching_pairs(self): + for ms in self.matching_sets: + yield from yield_not_same(permutations(ms, 2)) + + def _yield_nonmatching_pairs(self): + # todo: this could be much better, use meshgrid or shuffle index arrays + return yield_not_same(permutations(self.nonmatching, 2)) + + def _empty_counts(self): + shape = [len(b) - 1 for b in self.boundaries] + return np.zeros(shape, int) + + def _query_to_idxs(self, q_idx, t_idx, counts=None): + response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) + idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] + + if counts is None: + counts = self._empty_counts() + + for idx in zip(*idxs): + counts[idx] += 1 + + return counts + + def _counts_array(self, idx_pairs, threads=None): + counts = self._empty_counts() + if threads is None or threads == 0 and cpu_count == 1: + for q_idx, t_idx in idx_pairs: + counts = self._query_to_idxs(q_idx, t_idx, counts) + return counts + + threads = threads or cpu_count + idx_pairs = np.asarray(idx_pairs, dtype=int) + chunks = chunksize(len(idx_pairs), threads) + + with ProcessPoolExecutor(threads) as exe: + for these_counts in exe.map( + self._query_to_idxs, + idx_pairs[:, 0], + idx_pairs[:, 1], + chunksize=chunks, + ): + counts += these_counts + + return counts + + def _pick_nonmatching_pairs(self, n_matching_qual_vals): + # pre-calculating which pairs we're going to use, + # rather than drawing them as we need them, + # means that we can parallelise the later step more effectively. 
+ # Slowdowns here are practically meaningless + # because of how long distdot calculation will take + all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) + nonmatching_pairs = [] + n_nonmatching_qual_vals = 0 + while n_nonmatching_qual_vals < n_matching_qual_vals: + idx = self.rng.integers(0, len(all_nonmatching_pairs)) + nonmatching_pair = all_nonmatching_pairs.pop(idx) + nonmatching_pairs.append(nonmatching_pair) + n_nonmatching_qual_vals += len(self.dotprops[nonmatching_pair[0]]) + + return nonmatching_pairs + + def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: + matching_pairs = list(set(self._yield_matching_pairs())) + # need to know the eventual distdot count + # so we know how many non-matching pairs to draw + q_idx_count = Counter(p[0] for p in matching_pairs) + n_matching_qual_vals = sum( + len(self.dotprops[q_idx]) * n_reps for q_idx, n_reps in q_idx_count.items() + ) + + nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals) + + match_counts = self._counts_array(matching_pairs, threads) + nonmatch_counts = self._counts_array(nonmatching_pairs, threads) + + # account for there being different total numbers of match/nonmatch dist dots + matching_factor = nonmatch_counts.sum() / match_counts.sum() + if np.any(match_counts + nonmatch_counts == 0): + logger.warning("Some lookup cells have no data in them") + + cells = np.log2( + (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) + ) + + return self.boundaries, cells + + def build(self, threads=None) -> LookupNd: + """Build the score matrix. + All non-identical neuron pairs within all matching sets are selected, + and distdots calculated for those pairs. + Then, the minimum number of non-matching pairs are randomly drawn + so that at least as many distdots can be calculated for non-matching + pairs. + In each bin of the score matrix, the log2 odds ratio of a distdot + in that bin belonging to a match vs. non-match is calculated. + Parameters + ---------- + threads : int, optional + If None, act in serial. + If 0, use cpu_count - 1. + Otherwise, use the given value. + Returns + ------- + pd.DataFrame + Suitable for passing to navis.nbl.ScoringFunction + Raises + ------ + ValueError + If dist_bins or dot_bins are not set. + """ + return LookupNd(*self._build(threads)) + + +def dist_dot(q: Dotprops, t: Dotprops): + return list(q.dist_dots(t)) + + +def dist_dot_alpha(q: Dotprops, t: Dotprops): + dist, dot, alpha = q.dist_dots(t, alpha=True) + return [dist, dot * np.sqrt(alpha)] + + +class LookupDistDotBuilder(LookupNdBuilder): + def __init__( + self, + dotprops, + matching_sets, + dist_boundaries, + dot_boundaries, + nonmatching=None, + use_alpha=False, + seed=DEFAULT_SEED, + ): + f"""Class for building a 2-dimensional score lookup for NBLAST. + The scores are + 1. The distances between best-matching points + 2. The dot products of direction vectors around those points, + optionally scaled by the colinearity ``alpha``. + Parameters + ---------- + dotprops : dict or list of Dotprops + An indexable sequence of all neurons which will be used as the training set, + as Dotprops objects. + matching_sets : list of sets of int + Sets of neurons, as indices into ``dotprops``, which should be considered matches. + dist_boundaries : array-like of floats + Monotonically increasing boundaries between distance bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. 
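+            (e.g. ``[0, 10, 100]`` would become ``[-inf, 10, 100, inf]``,
+            while ``[1, 10, 100]`` would become ``[-inf, 1, 10, 100, inf]``).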
+ inf will be appended unless the last value is already inf. + Consider using ``numpy.logspace`` or ``numpy.geomspace`` + to generate the inner boundaries of the bins. + dot_boundaries : array-like of floats + Monotonically increasing boundaries between + (possibly alpha-scaled) dot product bins + (i.e. N values makes N-1 bins). + If the first value > 0, -inf will be prepended, + otherwise the first value will be replaced with -inf. + If the last value < 1, inf will be appended, + otherwise the last value will be replaced with inf. + Consider using ``numpy.linspace`` between 0 and 1. + nonmatching : set of int, optional + Set of neurons, as indices into ``dotprops``, + which should not be considered matches. + If not given, all ``dotprops`` will be used + (on the assumption that matches are a small subset of possible pairs). + use_alpha : bool, optional + If true, multiply the dot product by the geometric mean + of the matched points' alpha values + (i.e. ``sqrt(alpha1 * alpha2)``). + seed : int, optional + Non-matching pairs are drawn at random using this seed, + by default {DEFAULT_SEED} + """ + match_fn = dist_dot_alpha if use_alpha else dist_dot + super().__init__( + dotprops, + matching_sets, + [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + match_fn, + nonmatching, + seed, + ) + + def build(self, threads=None) -> Lookup2d: + return Lookup2d(*self._build(threads)) + + +class LookupNd: + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if [len(b) - 1 for b in boundaries] != list(cells.shape): + raise ValueError("boundaries and cells have inconsistent bin counts") + self.boundaries = boundaries + self.cells = cells + + def __call__(self, *args): + if len(args) != len(self.boundaries): + raise TypeError( + f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" + ) + + idxs = tuple( + np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries) + ) + out = self.cells[idxs] + return out + + +def format_boundaries(arr): + return [f"[{lower},{upper})" for lower, upper in zip(arr[:-1], arr[1:])] + + +def parse_boundary(item, strict=False): + explicit_interval = item[0] + item[-1] + if strict and item[0] + item[-1] != IMPLICIT_INTERVAL: + raise ValueError( + f"Enclosing characters {explicit_interval} " + f"do not match implicit interval specifiers {IMPLICIT_INTERVAL}" + ) + return tuple(float(i) for i in item[1:-1].split(",")) + + +def parse_boundaries(items, strict=False): + # declaring upper first and then checking for None + # pleases the type checker, and handles the case where + # len(items) == 0 + upper = None + for item in items: + lower, upper = parse_boundary(item, strict) + yield lower + if upper is None: + return + yield upper + + +class Lookup2d(LookupNd): + """Convenience class inheriting from LookupNd for the common 2D case. + Provides IO with pandas DataFrames. + """ + + def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): + if len(boundaries) != 2: + raise ValueError("boundaries must be of length 2; cells must be 2D") + super().__init__(boundaries, cells) + + def to_dataframe(self) -> pd.DataFrame: + # numpy.digitize includes left, excludes right, i.e. "[left,right)" + return pd.DataFrame( + self.cells, + format_boundaries(self.boundaries[0]), + format_boundaries(self.boundaries[1]), + ) + + @classmethod + def from_dataframe(cls, df: pd.DataFrame, strict=False): + f"""Parse score matrix from a dataframe with string index and column labels. 
+ Expects the index and column labels to specify an interval + like ``f"[{{lower}},{{upper}})"``. + Will replace the lowermost and uppermost bound with -inf and inf + if they are not already. + Parameters + ---------- + strict : bool, optional + If falsey (default), ignores parentheses and brackets, + effectively normalising to + ``{IMPLICIT_INTERVAL[0]}lower,upper{IMPLICIT_INTERVAL[1]}`` + as an implementation detail. + Otherwise, raises a ValueError if parens/brackets + do not match the implementation. + """ + boundaries = [] + for arr in (df.index, df.columns): + b = np.array(list(parse_boundaries(arr, strict))) + b[0] = -np.inf + b[-1] = np.inf + boundaries.append(b) + + return cls(boundaries, df.to_numpy()) + + +@lru_cache(maxsize=None) +def _smat_fcwb(alpha=False): + # cached private function defers construction + # until needed (speeding startup), + # but avoids repeated reads (speeding later uses) + fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha] + fpath = smat_path / fname + + return Lookup2d.from_dataframe( + pd.read_csv(fpath, index_col=0) + ) + + +def smat_fcwb(alpha=False): + # deepcopied so that mutations do not propagate to cache + return deepcopy(_smat_fcwb(alpha)) + + +def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): + """Checks functionally that the callable can be used as a score function. + Parameters + ---------- + nargs : optional int, default 2 + How many positional arguments the score function should have. + scalar : optional bool, default True + Check that the function can be used on ``nargs`` scalars. + array : optional bool, default True + Check that the function can be used on ``nargs`` 1D ``numpy.ndarray``s. + Raises + ------ + ValueError + If the score function is not appropriate. + """ + if scalar: + scalars = [0.5] * nargs + if not isinstance(fn(*scalars), float): + raise ValueError("smat does not take 2 floats and return a float") + + if array: + test_arr = np.array([0.5] * 3) + arrs = [test_arr] * nargs + try: + out = fn(*arrs) + except Exception as e: + raise ValueError(f"Failed to use smat with numpy arrays: {e}") + + if out.shape != test_arr.shape: + raise ValueError( + f"smat produced inconsistent shape: input {test_arr.shape}; output {out.shape}" + ) + + +SCORE_FN_DESCR = """ +NBLAST score functions take 2 floats or numpy arrays of floats of length N +(for matched dotprop points/tangents, distance and dot product; +the latter possibly scaled by the geometric mean of the alpha colinearity values) +and returns a float or N-length numpy array of floats. +""".strip().replace("\n", " ") + + +def parse_score_fn(smat, alpha=False): + f"""Interpret ``smat`` as a score function. + Primarily for backwards compatibility. + {SCORE_FN_DESCR} + Parameters + ---------- + smat : None | "auto" | str | os.PathLike | pandas.DataFrame | Callable[[float, float], float] + If ``None``, use ``operator.mul``. + If ``"auto"``, use ``navis.nbl.smat.smat_fcwb(alpha)``. + If a dataframe, use ``navis.nbl.smat.Lookup2d.from_dataframe(smat)``. + If another string or path-like, load from CSV in a dataframe and uses as above. + Also checks the signature of the callable. + Raises an error, probably a ValueError, if it can't be interpreted. + alpha : optional bool, default False + If ``smat`` is None, choose whether to use the FCWB matrices + with or without alpha. + Returns + ------- + Callable + Raises + ------ + ValueError + If score function cannot be interpreted. 
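+    Examples
+    --------
+    The simplest case is the ``None`` fallback to plain multiplication:
+
+    >>> fn = parse_score_fn(None)
+    >>> fn(0.5, 0.5)
+    0.25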
+ """ + if smat is None: + smat = operator.mul + elif smat == 'auto': + smat = smat_fcwb(alpha) + + if isinstance(smat, (str, os.PathLike)): + smat = pd.read_csv(smat, index_col=0) + + if isinstance(smat, pd.DataFrame): + smat = Lookup2d.from_dataframe(smat) + + if not callable(smat): + raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + + check_score_fn(smat) + + return smat diff --git a/tests/test_nbl/__init__.py b/tests/test_nbl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py new file mode 100644 index 00000000..93959da2 --- /dev/null +++ b/tests/test_nbl/test_smat.py @@ -0,0 +1,161 @@ +from navis import Dotprops +import pytest + +import numpy as np + +from navis.nbl.smat import ( + wrap_bounds, LookupNd, Lookup2d, LookupDistDotBuilder +) + + +SMALLEST_DIM_SIZE = 3 +SEED = 1991 + + +@pytest.mark.parametrize( + ["arr", "left", "right", "expected"], + [ + ([1, 2], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([-np.inf, 1, 2, np.inf], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), + ([0, 1, 2, 3], 0, 3, [-np.inf, 1, 2, np.inf]), + ], +) +def test_wrap_bounds(arr, left, right, expected): + assert np.allclose(wrap_bounds(arr, left, right), expected) + + +def test_wrap_bounds_error(): + with pytest.raises(ValueError): + wrap_bounds([1, 2, 1]) + + +def lookup_args(ndim): + f""" + Create arguments for an ND lookup table. + The first dimension is of size {SMALLEST_DIM_SIZE}, + and subsequent dimensions are 1 longer than the previous. + The data in the cells are the sequence from 0 + to the size of the array. + The boundaries are 0 to the length of the dimension, + with the left and rightmost values replaced with -inf and inf respectively. + Examples + -------- + >>> lookup_args(2) + ( + [ + array([-inf, 1, 2, inf]), + array([-inf, 1, 2, 3, inf]), + ], + array([ + [ 0, 1, 2, 3 ], + [ 4, 5, 6, 7 ], + [ 8, 9, 10, 11 ], + ]), + ) + """ + shape = tuple(range(SMALLEST_DIM_SIZE, SMALLEST_DIM_SIZE + ndim)) + cells = np.arange(np.product(shape)).reshape(shape) + boundaries = [] + for s in shape: + b = np.arange(s + 1, dtype=float) + b[0] = -np.inf + b[-1] = np.inf + boundaries.append(b) + return boundaries, cells + + +def fmt_array(arg): + if np.isscalar(arg): + return str(arg) + else: + return "[" + ",".join(fmt_array(v) for v in arg) + "]" + + +@pytest.mark.parametrize( + ["ndim"], [[1], [2], [3], [4], [5]], ids=lambda x: f"{x}D" +) +@pytest.mark.parametrize( + ["arg"], + [ + (-1000,), + (0,), + (1,), + (1.5,), + (2,), + (1000,), + ([-1000, 0, 1, 1.5, 2, 1000],), + ], + ids=fmt_array, +) +def test_lookupNd(ndim, arg): + lookup = LookupNd(*lookup_args(ndim)) + + args = [arg for _ in range(ndim)] + expected_arr_idx = np.floor([ + np.clip(arg, 0, dim + SMALLEST_DIM_SIZE - 1) for dim in range(ndim) + ]).astype(int) + expected_val = np.ravel_multi_index( + tuple(expected_arr_idx), lookup.cells.shape + ) + + response = lookup(*args) + assert np.all(response == expected_val) + + +@pytest.mark.parametrize(["strict"], [(False,), (True,)]) +def test_lookup2d_roundtrip(strict): + lookup = Lookup2d(*lookup_args(2)) + df = lookup.to_dataframe() + lookup2 = Lookup2d.from_dataframe(df, strict=strict) + assert np.allclose(lookup.cells, lookup2.cells) + for b1, b2 in zip(lookup.boundaries, lookup2.boundaries): + assert np.allclose(b1, b2) + + +def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): + k = 5 + dotprops = [Dotprops(n.nodes[["x", "y", "z"]], k) for n in neurons] + n_orig = len(dotprops) + + 
# make jittered copies of these neurons + rng = np.random.default_rng(SEED) + jitter_sigma = 50 + matching_sets = [] + for idx, dp in enumerate(dotprops[:]): + dotprops.append( + Dotprops( + dp.points + rng.normal(0, jitter_sigma, dp.points.shape), k + ) + ) + # assign each neuron its jittered self as a match + matching_sets.append({idx, idx + n_orig}) + + # original neurons should all not match each other + nonmatching = list(range(n_orig)) + + # max distance between any 2 points in the data + # for calculating dist boundaries + max_dist = np.linalg.norm( + np.ptp( + np.concatenate([dp.points for dp in dotprops], axis=0), axis=0, + ) + ) + + return LookupDistDotBuilder( + dotprops, + matching_sets, + np.geomspace(10, max_dist, 5)[:-1], + np.linspace(0, 1, 5), + nonmatching, + alpha, + seed=SEED + 1, + ) + + +@pytest.mark.parametrize(["alpha"], [(True,), (False,)]) +@pytest.mark.parametrize(["threads"], [(0,), (2,), (None,)]) +def test_lookupdistdotbuilder_builds(treeneuron_dfs, threads, alpha): + builder = prepare_lookupdistdotbuilder(treeneuron_dfs, alpha) + lookup = builder.build(threads) + # `pytest -rP` to see output + print(lookup.to_dataframe()) From 989c648798b9eca871e9c9bd912224ef6c1c99b1 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 18:40:21 +0000 Subject: [PATCH 04/21] Fix smat tests --- tests/test_nbl/test_smat.py | 45 ++++++++++--------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 93959da2..e1d821fd 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -4,7 +4,7 @@ import numpy as np from navis.nbl.smat import ( - wrap_bounds, LookupNd, Lookup2d, LookupDistDotBuilder + Digitizer, LookupNd, Lookup2d, LookupDistDotBuilder ) @@ -12,23 +12,6 @@ SEED = 1991 -@pytest.mark.parametrize( - ["arr", "left", "right", "expected"], - [ - ([1, 2], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), - ([-np.inf, 1, 2, np.inf], -np.inf, np.inf, [-np.inf, 1, 2, np.inf]), - ([0, 1, 2, 3], 0, 3, [-np.inf, 1, 2, np.inf]), - ], -) -def test_wrap_bounds(arr, left, right, expected): - assert np.allclose(wrap_bounds(arr, left, right), expected) - - -def test_wrap_bounds_error(): - with pytest.raises(ValueError): - wrap_bounds([1, 2, 1]) - - def lookup_args(ndim): f""" Create arguments for an ND lookup table. @@ -38,6 +21,7 @@ def lookup_args(ndim): to the size of the array. The boundaries are 0 to the length of the dimension, with the left and rightmost values replaced with -inf and inf respectively. 
+ Examples -------- >>> lookup_args(2) @@ -55,13 +39,8 @@ def lookup_args(ndim): """ shape = tuple(range(SMALLEST_DIM_SIZE, SMALLEST_DIM_SIZE + ndim)) cells = np.arange(np.product(shape)).reshape(shape) - boundaries = [] - for s in shape: - b = np.arange(s + 1, dtype=float) - b[0] = -np.inf - b[-1] = np.inf - boundaries.append(b) - return boundaries, cells + digitizers = [Digitizer(np.arange(s + 1, dtype=float)) for s in shape] + return digitizers, cells def fmt_array(arg): @@ -102,14 +81,14 @@ def test_lookupNd(ndim, arg): assert np.all(response == expected_val) -@pytest.mark.parametrize(["strict"], [(False,), (True,)]) -def test_lookup2d_roundtrip(strict): - lookup = Lookup2d(*lookup_args(2)) +def test_lookup2d_roundtrip(): + digs, cells = lookup_args(2) + lookup = Lookup2d(*digs, cells=cells) df = lookup.to_dataframe() - lookup2 = Lookup2d.from_dataframe(df, strict=strict) + lookup2 = Lookup2d.from_dataframe(df) assert np.allclose(lookup.cells, lookup2.cells) - for b1, b2 in zip(lookup.boundaries, lookup2.boundaries): - assert np.allclose(b1, b2) + for b1, b2 in zip(lookup.digitizers, lookup2.digitizers): + assert b1 == b2 def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): @@ -144,8 +123,8 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): return LookupDistDotBuilder( dotprops, matching_sets, - np.geomspace(10, max_dist, 5)[:-1], - np.linspace(0, 1, 5), + Digitizer.from_geom(10, max_dist, 5), + Digitizer.from_linear(0, 1, 5), nonmatching, alpha, seed=SEED + 1, From f7d0ae181a2bd03256d7b8e230c323ad64cd0721 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 25 Nov 2021 18:40:29 +0000 Subject: [PATCH 05/21] Significant refactor of smat.py --- navis/nbl/smat.py | 372 +++++++++++++++++++++++++++++----------------- 1 file changed, 234 insertions(+), 138 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 01dc6f2a..ca829150 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -4,12 +4,13 @@ import os from collections import Counter from concurrent.futures import ProcessPoolExecutor -from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple +from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple, Union import logging from pathlib import Path from functools import lru_cache from copy import deepcopy import operator +import math import numpy as np import pandas as pd @@ -25,62 +26,13 @@ IMPLICIT_INTERVAL = "[)" fp = Path(__file__).resolve().parent -smat_path = fp / 'score_mats' +smat_path = fp / "score_mats" def chunksize(it_len, cpu_count, min_chunk=50): return max(min_chunk, int(it_len / (cpu_count * 4))) -def wrap_bounds(arr: np.ndarray, left: float = -np.inf, right: float = np.inf): - """Ensures that boundaries cover -inf to inf. - If the left or rightmost values lie within the interval ``(left, right)``, - ``-inf`` or ``inf`` will be prepended or appended respectively. - Otherwise, the left and right values will be set to - ``-inf`` and ``inf`` respectively. - Parameters - ---------- - arr : array-like of floats - Array of boundaries - left : float, optional - If the first value is greater than this, prepend -inf; - otherwise replace that value with -inf. - Defaults to -inf if None. - right : float, optional - If the last value is less than this, append inf; - otherwise replace that value with inf. - Defaults to inf if None. - Returns - ------- - np.ndarray of floats - Boundaries from -inf to inf - Raises - ------ - ValueError - If values of ``arr`` are not monotonically increasing. 
- """ - if left is None: - left = -np.inf - if right is None: - right = np.inf - - if not np.all(np.diff(arr) > 0): - raise ValueError("Boundaries are not monotonically increasing") - - items = list(arr) - if items[0] <= left: - items[0] = -np.inf - else: - items.insert(0, -np.inf) - - if items[-1] >= right: - items[-1] = np.inf - else: - items.append(np.inf) - - return np.array(items) - - def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any]]: for a, b in pairs: if a != b: @@ -92,7 +44,7 @@ def __init__( self, dotprops, matching_sets, - boundaries: Sequence[Sequence[float]], + digitizers: Sequence[Digitizer], match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching=None, seed=DEFAULT_SEED, @@ -105,15 +57,8 @@ def __init__( as Dotprops objects. matching_sets : list of sets of int Sets of neurons, as indices into ``dotprops``, which should be considered matches. - boundaries : sequence of array-likes of floats - List of lists, where the inner lists are monotonically increasing - from -inf to inf. - The length of the outer list is the dimensionality of the lookup table. - The inner lists are the boundaries of bins for that dimension, - i.e. an inner list of N items describes N-1 bins. - If an inner list is not inf-bounded, - -inf and inf will be prepended and appended. - See the ``wrap_bounds`` convenience function. + digitizers : sequence of Digitizers + The length is the dimensionality of the lookup table. match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]] Function taking 2 arguments, both instances of ``navis.core.neurons.Dotprops``, @@ -136,7 +81,7 @@ def __init__( self.matching_sets = matching_sets self._nonmatching = nonmatching self.match_fn = match_fn - self.boundaries = [wrap_bounds(b) for b in boundaries] + self.digitizers = list(digitizers) self.rng = np.random.default_rng(seed) @@ -162,12 +107,12 @@ def _yield_nonmatching_pairs(self): return yield_not_same(permutations(self.nonmatching, 2)) def _empty_counts(self): - shape = [len(b) - 1 for b in self.boundaries] + shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) def _query_to_idxs(self, q_idx, t_idx, counts=None): response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) - idxs = [np.digitize(r, b) - 1 for r, b in zip(response, self.boundaries)] + idxs = [dig(r) for dig, r in zip(self.digitizers, response)] if counts is None: counts = self._empty_counts() @@ -216,7 +161,7 @@ def _pick_nonmatching_pairs(self, n_matching_qual_vals): return nonmatching_pairs - def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: + def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw @@ -239,7 +184,7 @@ def _build(self, threads) -> Tuple[List[np.ndarray], np.ndarray]: (match_counts * matching_factor + epsilon) / (nonmatch_counts + epsilon) ) - return self.boundaries, cells + return self.digitizers, cells def build(self, threads=None) -> LookupNd: """Build the score matrix. @@ -265,7 +210,8 @@ def build(self, threads=None) -> LookupNd: ValueError If dist_bins or dot_bins are not set. 
""" - return LookupNd(*self._build(threads)) + dig, cells = self._build(threads) + return LookupNd(dig, cells) def dist_dot(q: Dotprops, t: Dotprops): @@ -282,8 +228,8 @@ def __init__( self, dotprops, matching_sets, - dist_boundaries, - dot_boundaries, + dist_digitizer: Digitizer, + dot_digitizer: Digitizer, nonmatching=None, use_alpha=False, seed=DEFAULT_SEED, @@ -334,106 +280,256 @@ def __init__( super().__init__( dotprops, matching_sets, - [wrap_bounds(dist_boundaries, 0), wrap_bounds(dot_boundaries, 0, 1)], + [dist_digitizer, dot_digitizer], match_fn, nonmatching, seed, ) def build(self, threads=None) -> Lookup2d: - return Lookup2d(*self._build(threads)) + (dig0, dig1), cells = self._build(threads) + return Lookup2d(dig0, dig1, cells) + + +def is_monotonically_increasing(lst): + for prev_idx, item in enumerate(lst[1:]): + if item <= lst[prev_idx]: + return False + return True + + +def parse_boundary(item: str): + explicit_interval = item[0] + item[-1] + if explicit_interval == "[)": + right = False + elif explicit_interval == "(]": + right = True + else: + raise ValueError( + f"Enclosing characters '{explicit_interval}' do not match a half-open interval" + ) + return tuple(float(i) for i in item[1:-1].split(",")), right + + +class Digitizer: + def __init__( + self, + boundaries: Sequence[float], + clip: Tuple[bool, bool] = (True, True), + right=False, + ): + self.right = right + + boundaries = list(boundaries) + self._min = boundaries[0] + if clip[0]: + boundaries[0] = -math.inf + elif boundaries[0] != -math.inf: + boundaries.insert(0, -math.inf) + + self._max = boundaries[-1] + if clip[1]: + boundaries[-1] = math.inf + elif boundaries[-1] != math.inf: + boundaries.append(math.inf) + + if not is_monotonically_increasing(boundaries): + raise ValueError("Boundaries are not monotonically increasing") + + self.boundaries = np.asarray(boundaries) + + def __len__(self): + return len(self.boundaries) - 1 + + def __call__(self, value: float): + # searchsorted is marginally faster than digitize as it skips monotonicity checks + return ( + np.searchsorted( + self.boundaries, value, side="left" if self.right else "right" + ) + - 1 + ) + + def to_strings(self) -> List[str]: + if self.right: + lb = "(" + rb = "]" + else: + lb = "[" + rb = ")" + + return [ + f"{lb}{lower},{upper}{rb}" + for lower, upper in zip(self.boundaries[:-1], self.boundaries[1:]) + ] + + @classmethod + def from_strings( + cls, bound_strs: Sequence[str] + ): + """Set digitizer boundaries based on a sequence of interval expressions. + + e.g. ``["(0, 1]", "(1, 5]", "(5, 10]"]`` + + The lowermost and uppermost boundaries are converted to -infinity and infinity respectively. + + Parameters + ---------- + bound_strs : Sequence[str] + Strings representing intervals, which must abut and have open/closed boundaries + specified by brackets. 
+ + Returns + ------- + Digitizer + """ + bounds: List[float] = [] + last_upper = None + last_right = None + for item in bound_strs: + (lower, upper), right = parse_boundary(item) + bounds.append(float(lower)) + + if last_right is not None: + if right != last_right: + raise ValueError("Inconsistent half-open interval") + else: + last_right = right + + if last_upper is not None: + if lower != last_upper: + raise ValueError("Half-open intervals do not abut") + + last_upper = upper + + bounds.append(float(last_upper)) + return cls(bounds, right=last_right) + + @classmethod + def from_linear(cls, lower: float, upper: float, nbins: int, right=False): + """Choose digitizer boundaries spaced linearly between two values. + + Parameters + ---------- + lower : float + Lowest value + upper : float + Highest value + nbins : int + Number of bins + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + + Returns + ------- + Digitizer + """ + arr = np.linspace(lower, upper, nbins + 1, endpoint=True) + return cls(arr, right=right) + + @classmethod + def from_geom(cls, lowest_upper: float, upper: float, nbins: int, right=False): + """Choose digitizer boundaries in a geometric sequence. + + Parameters + ---------- + lowest_upper : float + Upper bound of the lowest bin. + The lower bound of the lowest bin is often 0, which cannot be represented in a nontrivial geometric sequence. + upper : float + Upper bound of the highest bin. + This will be replaced by +inf, but the value is used for setting the rest of the sequence. + nbins : int + Number of bins + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + + Returns + ------- + Digitizer + """ + # Because geom can't start at 0, instead start at the upper bound + # of the lowest bin, then prepend -inf. + # Because the extra bin is added, do not need to add 1 to nbins in geomspace. + arr = np.geomspace(lowest_upper, upper, nbins, True) + return cls(arr, clip=(False, True), right=right) + + @classmethod + def from_data(cls, data: Sequence[float], nbins: int, right=False): + """Choose digitizer boundaries to evenly partition the given values. + + Parameters + ---------- + data : Sequence[float] + Data which should be evenly partitioned by the resulting digitizer. 
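+            (e.g. with ``nbins=4``, boundaries are placed at the 0th,
+            25th, 50th, 75th, and 100th percentiles of ``data``).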
+ nbins : int + Number of bins + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + + Returns + ------- + Digitizer + """ + arr = np.quantile(data, np.linspace(0, 1, nbins + 1, True)) + return cls(arr, right=right) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Digitizer): + return NotImplemented + return self.right == other.right and np.allclose(self.boundaries, other.boundaries) class LookupNd: - def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): - if [len(b) - 1 for b in boundaries] != list(cells.shape): + def __init__(self, digitizers: List[Digitizer], cells: np.ndarray): + if [len(b) for b in digitizers] != list(cells.shape): raise ValueError("boundaries and cells have inconsistent bin counts") - self.boundaries = boundaries + self.digitizers = digitizers self.cells = cells def __call__(self, *args): - if len(args) != len(self.boundaries): + if len(args) != len(self.digitizers): raise TypeError( - f"Lookup takes {len(self.boundaries)} arguments but {len(args)} were given" + f"Lookup takes {len(self.digitizers)} arguments but {len(args)} were given" ) - idxs = tuple( - np.digitize(r, b) - 1 for r, b in zip(args, self.boundaries) - ) + idxs = tuple(d(arg) for d, arg in zip(self.digitizers, args)) out = self.cells[idxs] return out -def format_boundaries(arr): - return [f"[{lower},{upper})" for lower, upper in zip(arr[:-1], arr[1:])] - - -def parse_boundary(item, strict=False): - explicit_interval = item[0] + item[-1] - if strict and item[0] + item[-1] != IMPLICIT_INTERVAL: - raise ValueError( - f"Enclosing characters {explicit_interval} " - f"do not match implicit interval specifiers {IMPLICIT_INTERVAL}" - ) - return tuple(float(i) for i in item[1:-1].split(",")) - - -def parse_boundaries(items, strict=False): - # declaring upper first and then checking for None - # pleases the type checker, and handles the case where - # len(items) == 0 - upper = None - for item in items: - lower, upper = parse_boundary(item, strict) - yield lower - if upper is None: - return - yield upper - - class Lookup2d(LookupNd): """Convenience class inheriting from LookupNd for the common 2D case. Provides IO with pandas DataFrames. """ - def __init__(self, boundaries: List[np.ndarray], cells: np.ndarray): - if len(boundaries) != 2: - raise ValueError("boundaries must be of length 2; cells must be 2D") - super().__init__(boundaries, cells) + def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarray): + super().__init__([digitizer0, digitizer1], cells) def to_dataframe(self) -> pd.DataFrame: - # numpy.digitize includes left, excludes right, i.e. "[left,right)" return pd.DataFrame( self.cells, - format_boundaries(self.boundaries[0]), - format_boundaries(self.boundaries[1]), + self.digitizers[0].to_strings(), + self.digitizers[1].to_strings(), ) @classmethod - def from_dataframe(cls, df: pd.DataFrame, strict=False): + def from_dataframe(cls, df: pd.DataFrame): f"""Parse score matrix from a dataframe with string index and column labels. + Expects the index and column labels to specify an interval like ``f"[{{lower}},{{upper}})"``. Will replace the lowermost and uppermost bound with -inf and inf if they are not already. - Parameters - ---------- - strict : bool, optional - If falsey (default), ignores parentheses and brackets, - effectively normalising to - ``{IMPLICIT_INTERVAL[0]}lower,upper{IMPLICIT_INTERVAL[1]}`` - as an implementation detail. 
- Otherwise, raises a ValueError if parens/brackets - do not match the implementation. """ - boundaries = [] - for arr in (df.index, df.columns): - b = np.array(list(parse_boundaries(arr, strict))) - b[0] = -np.inf - b[-1] = np.inf - boundaries.append(b) - - return cls(boundaries, df.to_numpy()) + return cls( + Digitizer.from_strings(df.index), + Digitizer.from_strings(df.columns), + df.to_numpy(), + ) @lru_cache(maxsize=None) @@ -444,9 +540,7 @@ def _smat_fcwb(alpha=False): fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha] fpath = smat_path / fname - return Lookup2d.from_dataframe( - pd.read_csv(fpath, index_col=0) - ) + return Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0)) def smat_fcwb(alpha=False): @@ -489,7 +583,7 @@ def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): SCORE_FN_DESCR = """ -NBLAST score functions take 2 floats or numpy arrays of floats of length N +NBLAST score functions take 2 floats or N-length numpy arrays of floats (for matched dotprop points/tangents, distance and dot product; the latter possibly scaled by the geometric mean of the alpha colinearity values) and returns a float or N-length numpy array of floats. @@ -510,7 +604,7 @@ def parse_score_fn(smat, alpha=False): Also checks the signature of the callable. Raises an error, probably a ValueError, if it can't be interpreted. alpha : optional bool, default False - If ``smat`` is None, choose whether to use the FCWB matrices + If ``smat`` is ``"auto"``, choose whether to use the FCWB matrices with or without alpha. Returns ------- @@ -522,7 +616,7 @@ def parse_score_fn(smat, alpha=False): """ if smat is None: smat = operator.mul - elif smat == 'auto': + elif smat == "auto": smat = smat_fcwb(alpha) if isinstance(smat, (str, os.PathLike)): @@ -532,7 +626,9 @@ def parse_score_fn(smat, alpha=False): smat = Lookup2d.from_dataframe(smat) if not callable(smat): - raise ValueError("smat should be a callable, a path, a pandas.DataFrame, or 'auto'") + raise ValueError( + "smat should be a callable, a path, a pandas.DataFrame, or 'auto'" + ) check_score_fn(smat) From 7ad57322f431480a89d1a0f3642ce52b2461eac9 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 26 Nov 2021 14:31:38 +0000 Subject: [PATCH 06/21] smat: calculate digitizers from data --- navis/nbl/smat.py | 259 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 181 insertions(+), 78 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index ca829150..b30befc8 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -4,13 +4,25 @@ import os from collections import Counter from concurrent.futures import ProcessPoolExecutor -from typing import Iterator, Sequence, Callable, List, Iterable, Any, Tuple, Union +from typing import ( + Hashable, + Iterator, + Mapping, + Optional, + Sequence, + Callable, + List, + Iterable, + Any, + Tuple, +) import logging from pathlib import Path from functools import lru_cache from copy import deepcopy import operator import math +from collections import defaultdict import numpy as np import pandas as pd @@ -39,26 +51,43 @@ def yield_not_same(pairs: Iterable[Tuple[Any, Any]]) -> Iterator[Tuple[Any, Any] yield a, b +def concat_results(results: Iterable[List[np.ndarray]]) -> List[np.ndarray]: + intermediate = defaultdict(list) + for result_lst in results: + for idx, array in enumerate(result_lst): + intermediate[idx].append(array) + + return [np.concatenate(arrs) for arrs in intermediate.values()] + + +DotpropKey = Hashable + + class LookupNdBuilder: def __init__( self, - dotprops, 
-        matching_sets,
-        digitizers: Sequence[Digitizer],
         match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]],
-        nonmatching=None,
-        seed=DEFAULT_SEED,
+        dotprops: Mapping[DotpropKey, Dotprops],
+        matching_lists: List[List[DotpropKey]],
+        nonmatching_list: Optional[List[DotpropKey]] = None,
+        seed: int = DEFAULT_SEED,
     ) -> None:
         f"""Class for building an N-dimensional score lookup for NBLAST.
+
+        Once instantiated, the axes of the lookup table must be defined.
+        Call ``.with_digitizers()`` to manually define them,
+        or ``.with_bin_counts()`` to learn them from the matched-pair data.
+
+        Then call ``.build()`` to build the lookup table.
+
         Parameters
         ----------
+
         dotprops : dict or list of Dotprops
-            An indexable sequence of all neurons which will be used as the training set,
-            as Dotprops objects.
-        matching_sets : list of sets of int
-            Sets of neurons, as indices into ``dotprops``, which should be considered matches.
-        digitizers : sequence of Digitizers
-            The length is the dimensionality of the lookup table.
+            An indexable, consistently-ordered sequence of all neurons
+            which will be used as the training set, as Dotprops objects.
+        matching_lists : list of lists of indices into dotprops
+            Lists of neurons, as indices into ``dotprops``, which should be considered matches.
         match_fn : Callable[[Dotprops, Dotprops], List[np.ndarray[float]]]
             Function taking 2 arguments,
             both instances of ``navis.core.neurons.Dotprops``,
             and returning a list of 1D ``numpy.ndarray``s of floats.
-            The length of the list must be the same as the length of ``boundaries``.
+            The length of the list must be the same as the dimensionality of the lookup table.
             The length of the ``array``s must be the same
             as the number of points in the first argument.
             This function returns values describing the quality of
             point matches from a query to a target neuron.
-        nonmatching : set of int, optional
-            Set of neurons, as indices into ``dotprops``,
+        nonmatching_list : list of indices into dotprops, optional
+            List of neurons, as indices into ``dotprops``,
             which should not be considered matches.
             If not given, all ``dotprops`` will be used
             (on the assumption that matches are a small subset of possible pairs).
         seed : int, optional
             Non-matching pairs are drawn at random using this seed,
             by default {DEFAULT_SEED}
         """
         self.dotprops = dotprops
-        self.matching_sets = matching_sets
-        self._nonmatching = nonmatching
+        self.matching_lists = matching_lists
+        self._nonmatching_list = nonmatching_list
         self.match_fn = match_fn
+
+        self.digitizers: Optional[List[Digitizer]] = None
+        self.bin_counts: Optional[List[int]] = None
 
         self.rng = np.random.default_rng(seed)
 
+    def with_digitizers(self, digitizers: List[Digitizer]):
+        """Specify the axes of the output lookup table directly.
+
+        Parameters
+        ----------
+        digitizers : List[Digitizer]
+
+        Returns
+        -------
+        self
+            For chaining convenience.
+        """
+        self.digitizers = digitizers
+        return self
+
+    def with_bin_counts(self, bin_counts: List[int]):
+        """Specify the number of bins on each axis of the output lookup table.
+
+        The bin boundaries will be determined by evenly partitioning the data
+        from the matched pairs into quantiles, in each dimension.
+
+        Parameters
+        ----------
+        bin_counts : List[int]
+
+        Returns
+        -------
+        self
+            For chaining convenience.
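+
+        Examples
+        --------
+        For instance, a 2-dimensional (distance, dot product) builder
+        might use::
+
+            builder.with_bin_counts([21, 10])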
+ """ + self.bin_counts = bin_counts + return self + + def _dotprop_keys(self) -> Sequence[DotpropKey]: + """Get all indices into dotprops instance member""" try: return self.dotprops.keys() except AttributeError: return range(len(self.dotprops)) @property - def nonmatching(self): + def nonmatching(self) -> List[DotpropKey]: """Indices of nonmatching set of neurons""" - if self._nonmatching is None: - return set(self._dotprop_keys()) - return self._nonmatching + if self._nonmatching_list is None: + return list(self._dotprop_keys()) + return self._nonmatching_list - def _yield_matching_pairs(self): - for ms in self.matching_sets: + def _yield_matching_pairs(self) -> Iterator[Tuple[DotpropKey, DotpropKey]]: + """Yield all index pairs within all matching pairs""" + for ms in self.matching_lists: yield from yield_not_same(permutations(ms, 2)) - def _yield_nonmatching_pairs(self): + def _yield_nonmatching_pairs(self) -> Iterator[Tuple[DotpropKey, DotpropKey]]: + """Yield all index pairs within nonmatching list""" # todo: this could be much better, use meshgrid or shuffle index arrays return yield_not_same(permutations(self.nonmatching, 2)) - def _empty_counts(self): + def _empty_counts(self) -> np.ndarray: + """Create an empty array in which to store counts; shape determined by digitizer sizes.""" shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) + def _query(self, q_idx, t_idx) -> List[np.ndarray]: + """Get the results of applying the match function to dotprops specified by indices""" + return self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) + + def _query_many(self, idx_pairs, threads=None) -> Iterator[List[np.ndarray]]: + """Yield results from querying many pairs of dotprop indices""" + if threads is None or threads == 0 and cpu_count == 1: + for q_idx, t_idx in idx_pairs: + yield self._query(q_idx, t_idx) + + threads = threads or cpu_count + idx_pairs = np.asarray(idx_pairs) + chunks = chunksize(len(idx_pairs), threads) + + with ProcessPoolExecutor(threads) as exe: + yield from exe.map( + self._query, idx_pairs[:, 0], idx_pairs[:, 1], chunksize=chunks + ) + def _query_to_idxs(self, q_idx, t_idx, counts=None): - response = self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) - idxs = [dig(r) for dig, r in zip(self.digitizers, response)] + """Produce a digitized counts array from a given query-target pair""" + return self._count_results(self._query(q_idx, t_idx), counts) + + def _count_results(self, results: List[np.ndarray], counts=None): + """Convert raw match function ouput into a digitized counts array. + + Requires digitizers. + """ + idxs = [dig(r) for dig, r in zip(self.digitizers, results)] if counts is None: counts = self._empty_counts() @@ -123,6 +217,10 @@ def _query_to_idxs(self, q_idx, t_idx, counts=None): return counts def _counts_array(self, idx_pairs, threads=None): + """Convert index pairs into a digitized counts array. + + Requires digitizers. 
+ """ counts = self._empty_counts() if threads is None or threads == 0 and cpu_count == 1: for q_idx, t_idx in idx_pairs: @@ -133,6 +231,8 @@ def _counts_array(self, idx_pairs, threads=None): idx_pairs = np.asarray(idx_pairs, dtype=int) chunks = chunksize(len(idx_pairs), threads) + # because digitizing is not necessarily free, + # keep this parallelisation separate to that in _query_many with ProcessPoolExecutor(threads) as exe: for these_counts in exe.map( self._query_to_idxs, @@ -145,6 +245,7 @@ def _counts_array(self, idx_pairs, threads=None): return counts def _pick_nonmatching_pairs(self, n_matching_qual_vals): + """Using the seeded RNG, pick which nonmatching pairs to use.""" # pre-calculating which pairs we're going to use, # rather than drawing them as we need them, # means that we can parallelise the later step more effectively. @@ -162,6 +263,11 @@ def _pick_nonmatching_pairs(self, n_matching_qual_vals): return nonmatching_pairs def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: + if self.digitizers is None and self.bin_counts is None: + raise ValueError( + "Builder needs either digitizers or bin_counts; see with_* methods" + ) + matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw @@ -171,8 +277,16 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: ) nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals) + if self.digitizers: + match_counts = self._counts_array(matching_pairs, threads) + else: + match_results = concat_results(self._query_many(matching_pairs, threads)) + self.digitizers = [ + Digitizer.from_data(data, nbins) + for data, nbins in zip(match_results, self.bin_counts) + ] + match_counts = self._count_results(match_results) - match_counts = self._counts_array(matching_pairs, threads) nonmatch_counts = self._counts_array(nonmatching_pairs, threads) # account for there being different total numbers of match/nonmatch dist dots @@ -188,27 +302,26 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: def build(self, threads=None) -> LookupNd: """Build the score matrix. + All non-identical neuron pairs within all matching sets are selected, and distdots calculated for those pairs. Then, the minimum number of non-matching pairs are randomly drawn so that at least as many distdots can be calculated for non-matching pairs. + In each bin of the score matrix, the log2 odds ratio of a distdot in that bin belonging to a match vs. non-match is calculated. + Parameters ---------- threads : int, optional If None, act in serial. If 0, use cpu_count - 1. Otherwise, use the given value. + Returns ------- - pd.DataFrame - Suitable for passing to navis.nbl.ScoringFunction - Raises - ------ - ValueError - If dist_bins or dot_bins are not set. + LookupNd """ dig, cells = self._build(threads) return LookupNd(dig, cells) @@ -226,45 +339,27 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): class LookupDistDotBuilder(LookupNdBuilder): def __init__( self, - dotprops, - matching_sets, - dist_digitizer: Digitizer, - dot_digitizer: Digitizer, - nonmatching=None, - use_alpha=False, - seed=DEFAULT_SEED, + dotprops: Mapping[DotpropKey, Dotprops], + matching_lists: List[List[DotpropKey]], + nonmatching_list: Optional[List[DotpropKey]] = None, + use_alpha: bool = False, + seed: int = DEFAULT_SEED, ): f"""Class for building a 2-dimensional score lookup for NBLAST. The scores are 1. The distances between best-matching points 2. 
The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. + Parameters ---------- dotprops : dict or list of Dotprops An indexable sequence of all neurons which will be used as the training set, as Dotprops objects. - matching_sets : list of sets of int - Sets of neurons, as indices into ``dotprops``, which should be considered matches. - dist_boundaries : array-like of floats - Monotonically increasing boundaries between distance bins - (i.e. N values makes N-1 bins). - If the first value > 0, -inf will be prepended, - otherwise the first value will be replaced with -inf. - inf will be appended unless the last value is already inf. - Consider using ``numpy.logspace`` or ``numpy.geomspace`` - to generate the inner boundaries of the bins. - dot_boundaries : array-like of floats - Monotonically increasing boundaries between - (possibly alpha-scaled) dot product bins - (i.e. N values makes N-1 bins). - If the first value > 0, -inf will be prepended, - otherwise the first value will be replaced with -inf. - If the last value < 1, inf will be appended, - otherwise the last value will be replaced with inf. - Consider using ``numpy.linspace`` between 0 and 1. - nonmatching : set of int, optional - Set of neurons, as indices into ``dotprops``, + matching_lists : list of lists of indices into dotprops + List of neurons, as indices into ``dotprops``, which should be considered matches. + nonmatching_list : list of indices into dotprops, optional + List of neurons, as indices into ``dotprops``, which should not be considered matches. If not given, all ``dotprops`` will be used (on the assumption that matches are a small subset of possible pairs). @@ -279,14 +374,17 @@ def __init__( match_fn = dist_dot_alpha if use_alpha else dist_dot super().__init__( dotprops, - matching_sets, - [dist_digitizer, dot_digitizer], + matching_lists, match_fn, - nonmatching, + nonmatching_list, seed, ) def build(self, threads=None) -> Lookup2d: + if (self.digitizers and len(self.digitizers) != 2) or ( + self.bin_counts and len(self.bin_counts) != 2 + ): + raise ValueError("Expected 2 axes") (dig0, dig1), cells = self._build(threads) return Lookup2d(dig0, dig1, cells) @@ -364,9 +462,7 @@ def to_strings(self) -> List[str]: ] @classmethod - def from_strings( - cls, bound_strs: Sequence[str] - ): + def from_strings(cls, interval_strs: Sequence[str]): """Set digitizer boundaries based on a sequence of interval expressions. e.g. ``["(0, 1]", "(1, 5]", "(5, 10]"]`` @@ -386,7 +482,7 @@ def from_strings( bounds: List[float] = [] last_upper = None last_right = None - for item in bound_strs: + for item in interval_strs: (lower, upper), right = parse_boundary(item) bounds.append(float(lower)) @@ -409,6 +505,8 @@ def from_strings( def from_linear(cls, lower: float, upper: float, nbins: int, right=False): """Choose digitizer boundaries spaced linearly between two values. + Input values will be clipped to fit within the given interval. + Parameters ---------- lower : float @@ -429,17 +527,18 @@ def from_linear(cls, lower: float, upper: float, nbins: int, right=False): return cls(arr, right=right) @classmethod - def from_geom(cls, lowest_upper: float, upper: float, nbins: int, right=False): + def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False): """Choose digitizer boundaries in a geometric sequence. + Additional bins will be added above and below the given values. + Parameters ---------- lowest_upper : float Upper bound of the lowest bin. 
The lower bound of the lowest bin is often 0, which cannot be represented in a nontrivial geometric sequence. - upper : float - Upper bound of the highest bin. - This will be replaced by +inf, but the value is used for setting the rest of the sequence. + highest_lower : float + Lower bound of the highest bin. nbins : int Number of bins right : bool, optional @@ -453,8 +552,8 @@ def from_geom(cls, lowest_upper: float, upper: float, nbins: int, right=False): # Because geom can't start at 0, instead start at the upper bound # of the lowest bin, then prepend -inf. # Because the extra bin is added, do not need to add 1 to nbins in geomspace. - arr = np.geomspace(lowest_upper, upper, nbins, True) - return cls(arr, clip=(False, True), right=right) + arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True) + return cls(arr, clip=False, right=right) @classmethod def from_data(cls, data: Sequence[float], nbins: int, right=False): @@ -480,7 +579,9 @@ def from_data(cls, data: Sequence[float], nbins: int, right=False): def __eq__(self, other: object) -> bool: if not isinstance(other, Digitizer): return NotImplemented - return self.right == other.right and np.allclose(self.boundaries, other.boundaries) + return self.right == other.right and np.allclose( + self.boundaries, other.boundaries + ) class LookupNd: @@ -587,7 +688,9 @@ def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): (for matched dotprop points/tangents, distance and dot product; the latter possibly scaled by the geometric mean of the alpha colinearity values) and returns a float or N-length numpy array of floats. -""".strip().replace("\n", " ") +""".strip().replace( + "\n", " " +) def parse_score_fn(smat, alpha=False): From 902ed7401409cdfef283ec2560c9721c2ddd25d2 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 26 Nov 2021 14:52:19 +0000 Subject: [PATCH 07/21] Fix tests, finish docs --- navis/nbl/smat.py | 25 +++++++++++++++++++------ tests/test_nbl/test_smat.py | 13 +++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index b30befc8..29a62162 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -15,6 +15,7 @@ Iterable, Any, Tuple, + Union, ) import logging from pathlib import Path @@ -66,7 +67,7 @@ def concat_results(results: Iterable[List[np.ndarray]]) -> List[np.ndarray]: class LookupNdBuilder: def __init__( self, - dotprops: Mapping[DotpropKey, Dotprops], + dotprops: Union[List[Dotprops], Mapping[DotpropKey, Dotprops]], matching_lists: List[List[DotpropKey]], match_fn: Callable[[Dotprops, Dotprops], List[np.ndarray]], nonmatching_list: Optional[List[DotpropKey]] = None, @@ -339,7 +340,7 @@ def dist_dot_alpha(q: Dotprops, t: Dotprops): class LookupDistDotBuilder(LookupNdBuilder): def __init__( self, - dotprops: Mapping[DotpropKey, Dotprops], + dotprops: Union[List[Dotprops], Mapping[DotpropKey, Dotprops]], matching_lists: List[List[DotpropKey]], nonmatching_list: Optional[List[DotpropKey]] = None, use_alpha: bool = False, @@ -416,6 +417,21 @@ def __init__( clip: Tuple[bool, bool] = (True, True), right=False, ): + """Class converting continuous values into discrete indices given specific bin boundaries. + + Parameters + ---------- + boundaries : Sequence[float] + N boundaries specifying N-1 bins. + Must be monotonically increasing. + clip : Tuple[bool, bool], optional + Whether to set the bottom and top boundaries to -infinity and infinity respectively, + effectively clipping incoming values: by default (True, True). 
+ False means "add a new bin for out-of-range values". + right : bool, optional + Whether bins should include their right (rather than left) boundary, + by default False + """ self.right = right boundaries = list(boundaries) @@ -549,11 +565,8 @@ def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right= ------- Digitizer """ - # Because geom can't start at 0, instead start at the upper bound - # of the lowest bin, then prepend -inf. - # Because the extra bin is added, do not need to add 1 to nbins in geomspace. arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True) - return cls(arr, clip=False, right=right) + return cls(arr, clip=(False, False), right=right) @classmethod def from_data(cls, data: Sequence[float], nbins: int, right=False): diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index e1d821fd..76d7f9fb 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -99,7 +99,7 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): # make jittered copies of these neurons rng = np.random.default_rng(SEED) jitter_sigma = 50 - matching_sets = [] + matching_lists = [] for idx, dp in enumerate(dotprops[:]): dotprops.append( Dotprops( @@ -107,7 +107,7 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): ) ) # assign each neuron its jittered self as a match - matching_sets.append({idx, idx + n_orig}) + matching_lists.append([idx, idx + n_orig]) # original neurons should all not match each other nonmatching = list(range(n_orig)) @@ -122,13 +122,14 @@ def prepare_lookupdistdotbuilder(neurons, alpha=False, k=5): return LookupDistDotBuilder( dotprops, - matching_sets, - Digitizer.from_geom(10, max_dist, 5), - Digitizer.from_linear(0, 1, 5), + matching_lists, nonmatching, alpha, seed=SEED + 1, - ) + ).with_digitizers([ + Digitizer.from_geom(10, max_dist, 5), + Digitizer.from_linear(0, 1, 5), + ]) @pytest.mark.parametrize(["alpha"], [(True,), (False,)]) From fa87ae32616aa0e1b4f5863e50469288d612c792 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Fri, 26 Nov 2021 18:08:38 +0000 Subject: [PATCH 08/21] smat: cache match results --- .gitignore | 2 ++ navis/nbl/smat.py | 69 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index da2d03ca..d35c0bea 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,5 @@ ENV/ #*.swc #*.json MANIFEST + +tmp/ diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 29a62162..f27fd96e 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -115,7 +115,16 @@ def __init__( self.digitizers: Optional[List[Digitizer]] = None self.bin_counts: Optional[List[int]] = None - self.rng = np.random.default_rng(seed) + self.seed = seed + self._ndim: Optional[int] = None + + @property + def ndim(self) -> int: + if self._ndim is None: + idx1, idx2 = self._dotprop_keys()[:2] + self._ndim = len(self._query(idx1, idx2)) + self._query.cache_clear() + return self._ndim def with_digitizers(self, digitizers: List[Digitizer]): """Specify the axes of the output lookup table directly. @@ -129,7 +138,14 @@ def with_digitizers(self, digitizers: List[Digitizer]): self For chaining convenience. 
""" + if len(digitizers) != self.ndim: + raise ValueError( + f"Match function returns {self.ndim} values " + f"but provided {len(digitizers)} digitizers" + ) + self.digitizers = digitizers + self.bin_counts = None return self def with_bin_counts(self, bin_counts: List[int]): @@ -147,7 +163,14 @@ def with_bin_counts(self, bin_counts: List[int]): self For chaining convenience. """ + if len(bin_counts) != self.ndim: + raise ValueError( + f"Match function returns {self.ndim} values " + f"but provided {len(bin_counts)} bin counts" + ) + self.bin_counts = bin_counts + self.digitizers = None return self def _dotprop_keys(self) -> Sequence[DotpropKey]: @@ -179,6 +202,7 @@ def _empty_counts(self) -> np.ndarray: shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) + @lru_cache def _query(self, q_idx, t_idx) -> List[np.ndarray]: """Get the results of applying the match function to dotprops specified by indices""" return self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) @@ -217,7 +241,7 @@ def _count_results(self, results: List[np.ndarray], counts=None): return counts - def _counts_array(self, idx_pairs, threads=None): + def _counts_array(self, idx_pairs, threads=None, cache=False): """Convert index pairs into a digitized counts array. Requires digitizers. @@ -242,6 +266,8 @@ def _counts_array(self, idx_pairs, threads=None): chunksize=chunks, ): counts += these_counts + if not cache: + self._query.cache_clear return counts @@ -255,20 +281,16 @@ def _pick_nonmatching_pairs(self, n_matching_qual_vals): all_nonmatching_pairs = list(self._yield_nonmatching_pairs()) nonmatching_pairs = [] n_nonmatching_qual_vals = 0 + rng = np.random.default_rng(self.seed) while n_nonmatching_qual_vals < n_matching_qual_vals: - idx = self.rng.integers(0, len(all_nonmatching_pairs)) + idx = rng.integers(0, len(all_nonmatching_pairs)) nonmatching_pair = all_nonmatching_pairs.pop(idx) nonmatching_pairs.append(nonmatching_pair) n_nonmatching_qual_vals += len(self.dotprops[nonmatching_pair[0]]) return nonmatching_pairs - def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: - if self.digitizers is None and self.bin_counts is None: - raise ValueError( - "Builder needs either digitizers or bin_counts; see with_* methods" - ) - + def _get_pairs(self): matching_pairs = list(set(self._yield_matching_pairs())) # need to know the eventual distdot count # so we know how many non-matching pairs to draw @@ -278,14 +300,26 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: ) nonmatching_pairs = self._pick_nonmatching_pairs(n_matching_qual_vals) + return matching_pairs, nonmatching_pairs + + def _build(self, threads, cache=False) -> Tuple[List[Digitizer], np.ndarray]: + if self.digitizers is None and self.bin_counts is None: + raise ValueError( + "Builder needs either digitizers or bin_counts; see with_* methods" + ) + + matching_pairs, nonmatching_pairs = self._get_pairs() + if self.digitizers: - match_counts = self._counts_array(matching_pairs, threads) + match_counts = self._counts_array(matching_pairs, threads, cache) else: match_results = concat_results(self._query_many(matching_pairs, threads)) self.digitizers = [ Digitizer.from_data(data, nbins) for data, nbins in zip(match_results, self.bin_counts) ] + if not cache: + self._query.cache_clear() match_counts = self._count_results(match_results) nonmatch_counts = self._counts_array(nonmatching_pairs, threads) @@ -301,7 +335,7 @@ def _build(self, threads) -> Tuple[List[Digitizer], np.ndarray]: return self.digitizers, cells - def 
build(self, threads=None) -> LookupNd: + def build(self, threads=None, cache=False) -> LookupNd: """Build the score matrix. All non-identical neuron pairs within all matching sets are selected, @@ -324,7 +358,7 @@ def build(self, threads=None) -> LookupNd: ------- LookupNd """ - dig, cells = self._build(threads) + dig, cells = self._build(threads, cache) return LookupNd(dig, cells) @@ -380,13 +414,10 @@ def __init__( nonmatching_list, seed, ) + self._ndim = 2 - def build(self, threads=None) -> Lookup2d: - if (self.digitizers and len(self.digitizers) != 2) or ( - self.bin_counts and len(self.bin_counts) != 2 - ): - raise ValueError("Expected 2 axes") - (dig0, dig1), cells = self._build(threads) + def build(self, threads=None, cache=False) -> Lookup2d: + (dig0, dig1), cells = self._build(threads, cache) return Lookup2d(dig0, dig1, cells) @@ -664,6 +695,7 @@ def smat_fcwb(alpha=False): def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): """Checks functionally that the callable can be used as a score function. + Parameters ---------- nargs : optional int, default 2 @@ -672,6 +704,7 @@ def check_score_fn(fn: Callable, nargs=2, scalar=True, array=True): Check that the function can be used on ``nargs`` scalars. array : optional bool, default True Check that the function can be used on ``nargs`` 1D ``numpy.ndarray``s. + Raises ------ ValueError From 56d780ce35c2df6f98fd6d0f2202c3db8514c2c8 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 15:30:39 +0000 Subject: [PATCH 09/21] Switch Blaster.append to return numerical indices --- navis/nbl/base.py | 9 ++++++++- navis/nbl/nblast_funcs.py | 18 ++++++++---------- navis/nbl/synblast_funcs.py | 13 +++++++------ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/navis/nbl/base.py b/navis/nbl/base.py index c3ad4203..1b238103 100644 --- a/navis/nbl/base.py +++ b/navis/nbl/base.py @@ -17,6 +17,7 @@ import pandas as pd from abc import ABC, abstractmethod +from typing import Union, List from .. import utils, config @@ -24,6 +25,9 @@ FLOAT_DTYPES = {16: np.float16, 32: np.float32, 64: np.float64, None: None} +AppendOutput = Union[int, List['AppendOutput']] + + class Blaster(ABC): """Base class for blasting.""" @@ -37,7 +41,7 @@ def __init__(self, dtype=np.float64, progress=True): self.ids = [] @abstractmethod - def append(self, neurons): + def append(self, neurons) -> AppendOutput: """Append neurons.""" pass @@ -157,3 +161,6 @@ def all_by_all(self, scores='forward'): res.loc[:, :] = np.dstack((res, res.T)).max(axis=2) return res + + def __len__(self): + return len(self.neurons) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 32dae6fa..a6b28271 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -16,7 +16,6 @@ import numbers import os -import uuid import numpy as np import pandas as pd @@ -27,7 +26,7 @@ from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops -from .base import Blaster +from .base import Blaster, AppendOutput __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -104,8 +103,6 @@ def parse_interval(self, s): return float(s.strip("([])").split(",")[-1]) -NeuronId = Union[int, str, uuid.UUID] - class NBlaster(Blaster): """Implements version 2 of the NBLAST algorithm. 
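[Aside: a minimal sketch of the lookup machinery the builder patches above produce. This is not part of the series; the bin edges and cell values are invented, but the class and method names are the ones introduced in navis/nbl/smat.py.]

    import numpy as np
    from navis.nbl.smat import Digitizer, Lookup2d

    # Two axes: geometrically spaced distance bins, linearly spaced dot-product bins.
    dist_dig = Digitizer.from_geom(0.5, 50, 8)
    dot_dig = Digitizer.from_linear(0, 1, 5)
    # Invented cell values; a trained table would come from LookupDistDotBuilder.build().
    cells = np.random.default_rng(1991).normal(size=(len(dist_dig), len(dot_dig)))
    smat = Lookup2d(dist_dig, dot_dig, cells)

    dists = np.array([0.8, 12.0, 200.0])  # distances between matched points
    dots = np.array([0.9, 0.5, 0.1])      # dot products of their tangent vectors
    scores = smat(dists, dots)            # one log-odds-style score per point match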
@@ -163,12 +160,12 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops) -> Union[List[NeuronId], NeuronId]: + def append(self, dotprops) -> AppendOutput: """Append dotprops. - Returns the ID of the appended dotprops. + Returns the numerical index of the appended dotprops. If dotprops is a (possibly nested) sequence of dotprops, - return a (possibly nested) list of IDs. + return a (possibly nested) list of indices. """ if isinstance(dotprops, Dotprops): return self._append_dotprops(dotprops) @@ -178,12 +175,13 @@ def append(self, dotprops) -> Union[List[NeuronId], NeuronId]: except TypeError: # i.e. not iterable raise ValueError(f"Expected Dotprops or iterable thereof; got {type(dotprops)}") - def _append_dotprops(self, dotprops: Dotprops) -> NeuronId: + def _append_dotprops(self, dotprops: Dotprops) -> int: + next_id = len(self) self.neurons.append(dotprops) self.ids.append(dotprops.id) # Calculate score for self hit self.self_hits.append(self.calc_self_hit(dotprops)) - return dotprops.id + return next_id def calc_self_hit(self, dotprops): """Non-normalized value for self hit.""" @@ -195,7 +193,7 @@ def calc_self_hit(self, dotprops): dots = np.repeat(1, len(dotprops.points)) * np.sqrt(alpha) return self.score_fn(dists, dots).sum() - def single_query_target(self, q_idx, t_idx, scores='forward'): + def single_query_target(self, q_idx: int, t_idx: int, scores='forward'): """Query single target against single target.""" # Take a short-cut if this is a self-self comparison if q_idx == t_idx: diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index c0b7812d..821e617e 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -27,7 +27,7 @@ from ..
import config, utils from ..core import NeuronList, BaseNeuron -from .base import Blaster +from .base import Blaster, AppendOutput from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, nblast_preflight) @@ -84,8 +84,8 @@ def __init__(self, normalized=True, by_type=True, self.score_fn = ScoringFunction(smat) self.ids = [] - def append(self, neuron, id=None): - """Append neurons/connector tables, returning ids of added objects""" + def append(self, neuron, id=None) -> AppendOutput: + """Append neurons/connector tables, returning numerical indices of added objects""" if isinstance(neuron, pd.DataFrame): return self._append_connectors(neuron, id) @@ -102,10 +102,11 @@ def append(self, neuron, id=None): f"{type(neuron)}" ) - def _append_connectors(self, connectors: pd.DataFrame, id): + def _append_connectors(self, connectors: pd.DataFrame, id) -> int: if id is None: raise ValueError("Explicit non-None id required for appending connectors") + next_idx = len(self) self.ids.append(id) self.neurons.append({}) if not self.by_type: @@ -123,13 +124,13 @@ def _append_connectors(self, connectors: pd.DataFrame, id): # Calculate score for self hit self.self_hits.append(self.calc_self_hit(connectors)) - return id + return next_idx def calc_self_hit(self, cn): """Non-normalized value for self hit.""" return cn.shape[0] * self.score_fn(0, 1) - def single_query_target(self, q_idx, t_idx, scores='forward'): + def single_query_target(self, q_idx: int, t_idx: int, scores='forward'): """Query single target against single target.""" # Take a short-cut if this is a self-self comparison if q_idx == t_idx: From f84d02bfcf432c6732fff0080cbd1ecee01ae128 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 16:25:34 +0000 Subject: [PATCH 10/21] Dotprops.dist_dots type annotation fix --- navis/core/dotprop.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/navis/core/dotprop.py b/navis/core/dotprop.py index c4e5c64e..aaa8175c 100644 --- a/navis/core/dotprop.py +++ b/navis/core/dotprop.py @@ -332,7 +332,9 @@ def dist_dots(self, other: 'Dotprops', alpha: bool = False, distance_upper_bound: Optional[float] = None, - **kwargs) -> Tuple[np.ndarray, np.ndarray]: + **kwargs) -> Union[ + Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray, np.ndarray] + ]: """Query this Dotprops against another. This function is mainly for ``navis.nblast``. From c2c79acd6620366480e4ac8b1113f294b73e4270 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 16:34:05 +0000 Subject: [PATCH 11/21] small fixes --- navis/nbl/base.py | 4 ++-- navis/nbl/nblast_funcs.py | 4 ++-- navis/nbl/smat.py | 2 +- navis/nbl/synblast_funcs.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/navis/nbl/base.py b/navis/nbl/base.py index 1b238103..9e51344d 100644 --- a/navis/nbl/base.py +++ b/navis/nbl/base.py @@ -25,7 +25,7 @@ FLOAT_DTYPES = {16: np.float16, 32: np.float32, 64: np.float64, None: None} -AppendOutput = Union[int, List['AppendOutput']] +NestedIndices = Union[int, List['NestedIndices']] class Blaster(ABC): @@ -41,7 +41,7 @@ def __init__(self, dtype=np.float64, progress=True): self.ids = [] @abstractmethod - def append(self, neurons) -> AppendOutput: + def append(self, neurons) -> NestedIndices: """Append neurons.""" pass diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index a6b28271..f07986e7 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -26,7 +26,7 @@ from .. 
import utils, config from ..core import NeuronList, Dotprops, make_dotprops -from .base import Blaster, AppendOutput +from .base import Blaster, NestedIndices __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -160,7 +160,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops) -> AppendOutput: + def append(self, dotprops) -> NestedIndices: """Append dotprops. Returns the numerical index appended dotprops. diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index f27fd96e..bbb4e5ac 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -36,7 +36,6 @@ epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) -IMPLICIT_INTERVAL = "[)" fp = Path(__file__).resolve().parent smat_path = fp / "score_mats" @@ -212,6 +211,7 @@ def _query_many(self, idx_pairs, threads=None) -> Iterator[List[np.ndarray]]: if threads is None or threads == 0 and cpu_count == 1: for q_idx, t_idx in idx_pairs: yield self._query(q_idx, t_idx) + return threads = threads or cpu_count idx_pairs = np.asarray(idx_pairs) diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index 821e617e..2bbb9d54 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -27,7 +27,7 @@ from .. import config, utils from ..core import NeuronList, BaseNeuron -from .base import Blaster, AppendOutput +from .base import Blaster, NestedIndices from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, nblast_preflight) @@ -84,7 +84,7 @@ def __init__(self, normalized=True, by_type=True, self.score_fn = ScoringFunction(smat) self.ids = [] - def append(self, neuron, id=None) -> AppendOutput: + def append(self, neuron, id=None) -> NestedIndices: """Append neurons/connector tables, returning numerical indices of added objects""" if isinstance(neuron, pd.DataFrame): return self._append_connectors(neuron, id) From 57c557f5e6c8bb8c27f271e9faeed4674a4a4b2d Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 17:11:27 +0000 Subject: [PATCH 12/21] smat: write out unclipped bounds --- navis/nbl/smat.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index bbb4e5ac..86a8d836 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -466,14 +466,17 @@ def __init__( self.right = right boundaries = list(boundaries) - self._min = boundaries[0] + self._min = -math.inf if clip[0]: + self._min = boundaries[0] boundaries[0] = -math.inf elif boundaries[0] != -math.inf: + self._min = -math.inf boundaries.insert(0, -math.inf) - self._max = boundaries[-1] + self._max = math.inf if clip[1]: + self._max = boundaries[-1] boundaries[-1] = math.inf elif boundaries[-1] != math.inf: boundaries.append(math.inf) @@ -503,9 +506,12 @@ def to_strings(self) -> List[str]: lb = "[" rb = ")" + b = self.boundaries.copy() + b[0] = self._min + b[-1] = self._max return [ f"{lb}{lower},{upper}{rb}" - for lower, upper in zip(self.boundaries[:-1], self.boundaries[1:]) + for lower, upper in zip(b[:-1], b[1:]) ] @classmethod From 8fbee15e9e403438f86026fe953f7a7b4309308b Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 29 Nov 2021 17:13:28 +0000 Subject: [PATCH 13/21] incorporate smat.lookup2d into blasting functions --- navis/nbl/nblast_funcs.py | 41 ++++++++++++++++++++++++------------- navis/nbl/synblast_funcs.py | 25 +++++++++++++++------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git 
a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index f07986e7..e5a7e386 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -16,6 +16,7 @@ import numbers import os +import operator import numpy as np import pandas as pd @@ -24,6 +25,8 @@ from typing import Union, Optional, List from typing_extensions import Literal +from navis.nbl.smat import Lookup2d, smat_fcwb + from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops from .base import Blaster, NestedIndices @@ -119,10 +122,16 @@ class NBlaster(Blaster): normalized : bool If True, will normalize scores by the best possible score (i.e. self-self) of the query neuron. - smat : str | pd.DataFrame - Score matrix. If 'auto' (default), will use scoring matrices + smat : navis.nbl.smat.Lookup2d | pd.DataFrame | str + How to convert the point match pairs into an NBLAST score, + usually by a lookup table. + If 'auto' (default), will use scoring matrices from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be + implementation. + Dataframes will be used to build a ``Lookup2d``. + If ``limit_dist`` is not given, + will attempt to infer from the first axis of the lookup table. + If ``smat=None`` the scores will be generated as the product of the distances and the dotproduct of the vectors of nearest-neighbor pairs. limit_dist : float | "auto" | None @@ -145,20 +154,24 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', self.approx_nn = approx_nn self.desc = "NBlasting" - if smat == 'auto': - if self.use_alpha: - smat = pd.read_csv(f'{smat_path}/smat_alpha_fcwb.csv', - index_col=0) - else: - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) + if smat is None: + self.score_fn = operator.mul + elif smat == 'auto': + self.score_fn = smat_fcwb(self.use_alpha) + elif isinstance(smat, pd.DataFrame): + self.score_fn = Lookup2d.from_dataframe(smat) + else: + self.score_fn = smat - self.score_fn = ScoringFunction(smat) + self.distance_upper_bound = None - if limit_dist == 'auto': - self.distance_upper_bound = self.score_fn.max_dist - else: + if limit_dist is not None: self.distance_upper_bound = limit_dist + else: + try: + self.distance_upper_bound = self.score_fn.digitizers[0]._max + except AttributeError: + pass def append(self, dotprops) -> NestedIndices: """Append dotprops. diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index 2bbb9d54..eec20c1c 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -15,6 +15,7 @@ """Module contains functions implementing SyNBLAST.""" import os +import operator import numpy as np import pandas as pd @@ -28,9 +29,10 @@ from ..core import NeuronList, BaseNeuron from .base import Blaster, NestedIndices +from .smat import Lookup2d from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, - nblast_preflight) + nblast_preflight, smat_fcwb) __all__ = ['synblast'] @@ -59,10 +61,14 @@ class SynBlaster(Blaster): by_type : bool If True will only compare synapses with the same value in the "type" column. - smat : str | pd.DataFrame - Score matrix. If 'auto' (default), will use scoring matrices + smat : navis.nbl.smat.Lookup2d | pd.DataFrame | str + How to convert the point match pairs into an NBLAST score, + usually by a lookup table. + If 'auto' (default), will use scoring matrices from FCWB. Same behaviour as in R's nat.nblast - implementation. If ``smat=None`` the scores will be + implementation. 
+ Dataframes will be used to build a ``Lookup2d``. + If ``smat=None`` the scores will be generated as the product of the distances and the dotproduct of the vectors of nearest-neighbor pairs. progress : bool @@ -77,9 +83,14 @@ def __init__(self, normalized=True, by_type=True, self.normalized = normalized self.by_type = by_type - if smat == 'auto': - smat = pd.read_csv(f'{smat_path}/smat_fcwb.csv', - index_col=0) + if smat is None: + self.score_fn = operator.mul + elif smat == 'auto': + self.score_fn = smat_fcwb() + elif isinstance(smat, pd.DataFrame): + self.score_fn = Lookup2d.from_dataframe(smat) + else: + self.score_fn = smat self.score_fn = ScoringFunction(smat) self.ids = [] From a7f53495484a54d11d1c642d1f1333fed5714c84 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:15:04 +0000 Subject: [PATCH 14/21] Lookup2d docs --- navis/nbl/smat.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 86a8d836..63982a51 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -381,7 +381,9 @@ def __init__( seed: int = DEFAULT_SEED, ): f"""Class for building a 2-dimensional score lookup for NBLAST. + The scores are + 1. The distances between best-matching points 2. The dot products of direction vectors around those points, optionally scaled by the colinearity ``alpha``. @@ -658,9 +660,33 @@ class Lookup2d(LookupNd): """ def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarray): + """2D lookup table for converting NBLAST matches to scores. + + Commonly read from a ``pandas.DataFrame`` + or trained on data using a ``LookupDistDotBuilder``. + + Parameters + ---------- + digitizer0 : Digitizer + How to convert continuous values into an index for the first axis. + digitizer1 : Digitizer + How to convert continuous values into an index for the second axis. + cells : np.ndarray + Values to look up in the table. + """ super().__init__([digitizer0, digitizer1], cells) def to_dataframe(self) -> pd.DataFrame: + """Convert the lookup table into a ``pandas.DataFrame``. + + From there, it can be shared, saved, and so on. + + The index and column labels describe the intervals represented by each axis. + + Returns + ------- + pd.DataFrame + """ return pd.DataFrame( self.cells, self.digitizers[0].to_strings(), self.digitizers[1].to_strings(), ) From bf00b55d17a7a4cb407cb2ac7ca80652bc622823 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:15:39 +0000 Subject: [PATCH 15/21] Smat building tutorial --- docs/source/api.rst | 9 + docs/source/gallery.rst | 1 + docs/source/tutorials/smat.ipynb | 491 +++++++++++++++++++++++++++++++ 3 files changed, 501 insertions(+) create mode 100644 docs/source/tutorials/smat.ipynb diff --git a/docs/source/api.rst b/docs/source/api.rst index 4161dfbe..d82b4cc6 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -364,6 +364,15 @@ Functions to compare morphology. navis.synblast navis.persistence_distances +Utilities for creating your own score matrices for NBLAST can be found in + +.. autosummary:: + :toctree: generated/ + + navis.nbl.smat.Lookup2d + navis.nbl.smat.Digitizer + navis.nbl.smat.LookupDistDotBuilder + Polarity metrics ---------------- .. autosummary:: diff --git a/docs/source/gallery.rst b/docs/source/gallery.rst index 62a61f4b..4da347ad 100644 --- a/docs/source/gallery.rst +++ b/docs/source/gallery.rst @@ -72,6 +72,7 @@ function. There you will also find more examples. tutorials/nblast tutorials/nblast_flycircuit tutorials/nblast_hemibrain + tutorials/smat ..
raw:: html diff --git a/docs/source/tutorials/smat.ipynb b/docs/source/tutorials/smat.ipynb new file mode 100644 index 00000000..45672bb2 --- /dev/null +++ b/docs/source/tutorials/smat.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ".. _smat_intro:\n", + "\n", + "NBLAST score matrix generation\n", + "******************************\n", + "\n", + "NBLAST calculates the similarity between neurons' morphologies.\n", + "For more information on NBLAST in ``navis``, see the other tutorials.\n", + "\n", + "The core of the algorithm is the function which converts point matches\n", + "(a distance and a dot product) into a score\n", + "expressing how likely they are to have come from the same point cloud.\n", + "This is generally a 2D lookup table, referred to as the score matrix,\n", + "generated by \"training\" it on neurons known to be matching or non-matching.\n", + "\n", + "``navis`` provides (and uses by default) the score matrix used in the original publication (Costa et al., 2016).\n", + "This works quite well in many cases.\n", + "However, how appropriate it is for your data depends on a number of factors:\n", + "\n", + "* How big your neurons are (commonly addressed by scaling the distance axis of the built-in score matrix)\n", + "* How you have pre-processed your neurons (pruning dendrites, resampling etc.)\n", + "* Your actual task (matching left-right pairs, finding lineages etc.)\n", + "* How distinct you expect your matches and non-matches to be (e.g. how large a body volume you're drawing neurons from)\n", + "\n", + "Utilities in ``navis.nbl.smat`` allow you to train your own score matrix." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import navis\n", + "from navis.nbl.smat import Lookup2d, LookupDistDotBuilder, Digitizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``Lookup2d`` is the lookup table, and can be given to any nblast-related class or function.\n", + "Each axis is represented by a ``Digitizer``, which converts continuous values into discrete indices.\n", + "These indices are used to look up score values in an array.\n", + "\n", + "The ``LookupDistDotBuilder`` is a class which generates ``Lookup2d`` instances from training data.\n", + "\n", + "First, we need some training data.\n", + "Let's augment our set of example neurons by randomly mutating them:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# use a replicable random number generator\n", + "RNG = np.random.default_rng(2021)\n", + "\n", + "\n", + "def augment_neuron(\n", + " nrn: navis.TreeNeuron, scale_sigma=0.1, translation_sigma=50, jitter_sigma=10\n", + "):\n", + " nrn = nrn.copy(deepcopy=True)\n", + " nrn.name += \"_aug\"\n", + " dims = list(\"xyz\")\n", + " coords = nrn.nodes[dims].to_numpy()\n", + "\n", + " # translate whole neuron\n", + " coords += RNG.normal(scale=translation_sigma, size=coords.shape[-1])\n", + " # jitter individual coordinates\n", + " coords += RNG.normal(scale=jitter_sigma, size=coords.shape)\n", + " # rescale\n", + " mean = np.mean(coords, axis=0)\n", + " coords -= mean\n", + " coords *= RNG.normal(loc=1.0, scale=scale_sigma)\n", + " coords += mean\n", + "\n", + " nrn.nodes[dims] = coords\n", + " return nrn\n", + "\n", + "\n", + "original = list(navis.example_neurons())\n", + "jittered = [augment_neuron(n) for n in original]\n", + "\n", +
"dotprops = [navis.make_dotprops(n, k=5, resample=False) for n in original + jittered]\n", + "matching_pairs = [[idx, idx + len(original)] for idx in range(len(original))]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The score matrix builder needs some neurons as a list of ``navis.Dotprops`` objects,\n", + "and then to know which neurons should match with each other as indices into that list.\n", + "It's assumed that matches are relatively rare among the total set of possible pairings,\n", + "so non-matching pairs are drawn randomly (although a non-matching list can be given explicitly).\n", + "\n", + "Then it needs to know where to draw the boundaries between bins in the output lookup table.\n", + "These can be given explicitly as a list of 2 ``Digitizer`` s,\n", + "or can be inferred from the data: bins will be drawn to evenly partition the matching neuron scores.\n", + "\n", + "The resulting ``Lookup2d`` can be imported/exported as a ``pandas.DataFrame`` for ease of viewing and storing." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
[0.0,0.14588644700379452)[0.14588644700379452,0.29912252023057806)[0.29912252023057806,0.4802508950780032)[0.4802508950780032,0.7351365037506351)[0.7351365037506351,0.9988406500803395)
[2.0140850483155233,57.8493661406481)0.4866610.3790420.3872600.4456480.855775
[57.8493661406481,81.31283353333698)0.4047660.3437020.4064450.4178020.799595
[81.31283353333698,104.08576202392578)0.2579710.3118270.2176260.2487431.013926
[104.08576202392578,128.14104591262304)0.2557280.0896630.1715990.1159971.296510
[128.14104591262304,155.36119651794434)-0.136171-0.107249-0.1257510.2528831.987175
[155.36119651794434,202.6728515625)-0.575078-0.448307-0.475147-0.2210161.407061
[202.6728515625,395.9569088293945)-1.025938-0.948679-0.863801-0.6205120.054148
[395.9569088293945,4709.61474609375)-0.615558-0.737251-0.679764-0.454779-0.197384
\n", + "
" + ], + "text/plain": [ + " [0.0,0.14588644700379452) \\\n", + "[2.0140850483155233,57.8493661406481) 0.486661 \n", + "[57.8493661406481,81.31283353333698) 0.404766 \n", + "[81.31283353333698,104.08576202392578) 0.257971 \n", + "[104.08576202392578,128.14104591262304) 0.255728 \n", + "[128.14104591262304,155.36119651794434) -0.136171 \n", + "[155.36119651794434,202.6728515625) -0.575078 \n", + "[202.6728515625,395.9569088293945) -1.025938 \n", + "[395.9569088293945,4709.61474609375) -0.615558 \n", + "\n", + " [0.14588644700379452,0.29912252023057806) \\\n", + "[2.0140850483155233,57.8493661406481) 0.379042 \n", + "[57.8493661406481,81.31283353333698) 0.343702 \n", + "[81.31283353333698,104.08576202392578) 0.311827 \n", + "[104.08576202392578,128.14104591262304) 0.089663 \n", + "[128.14104591262304,155.36119651794434) -0.107249 \n", + "[155.36119651794434,202.6728515625) -0.448307 \n", + "[202.6728515625,395.9569088293945) -0.948679 \n", + "[395.9569088293945,4709.61474609375) -0.737251 \n", + "\n", + " [0.29912252023057806,0.4802508950780032) \\\n", + "[2.0140850483155233,57.8493661406481) 0.387260 \n", + "[57.8493661406481,81.31283353333698) 0.406445 \n", + "[81.31283353333698,104.08576202392578) 0.217626 \n", + "[104.08576202392578,128.14104591262304) 0.171599 \n", + "[128.14104591262304,155.36119651794434) -0.125751 \n", + "[155.36119651794434,202.6728515625) -0.475147 \n", + "[202.6728515625,395.9569088293945) -0.863801 \n", + "[395.9569088293945,4709.61474609375) -0.679764 \n", + "\n", + " [0.4802508950780032,0.7351365037506351) \\\n", + "[2.0140850483155233,57.8493661406481) 0.445648 \n", + "[57.8493661406481,81.31283353333698) 0.417802 \n", + "[81.31283353333698,104.08576202392578) 0.248743 \n", + "[104.08576202392578,128.14104591262304) 0.115997 \n", + "[128.14104591262304,155.36119651794434) 0.252883 \n", + "[155.36119651794434,202.6728515625) -0.221016 \n", + "[202.6728515625,395.9569088293945) -0.620512 \n", + "[395.9569088293945,4709.61474609375) -0.454779 \n", + "\n", + " [0.7351365037506351,0.9988406500803395) \n", + "[2.0140850483155233,57.8493661406481) 0.855775 \n", + "[57.8493661406481,81.31283353333698) 0.799595 \n", + "[81.31283353333698,104.08576202392578) 1.013926 \n", + "[104.08576202392578,128.14104591262304) 1.296510 \n", + "[128.14104591262304,155.36119651794434) 1.987175 \n", + "[155.36119651794434,202.6728515625) 1.407061 \n", + "[202.6728515625,395.9569088293945) 0.054148 \n", + "[395.9569088293945,4709.61474609375) -0.197384 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "builder = LookupDistDotBuilder(\n", + " dotprops, matching_pairs, use_alpha=True, seed=2021\n", + ").with_bin_counts([8, 5])\n", + "smat = builder.build()\n", + "as_table = smat.to_dataframe()\n", + "as_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we can have this score matrix, we can use it for a problem which can be solved by NBLAST:\n", + "we've mixed up a bag of neurons which look very similar to some of our examples,\n", + "and need to know which they match with." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "NBLAST is optimized for data in microns and it looks like your queries are not in microns.\n", + "NBLAST is optimized for data in microns and it looks like your targets are not in microns.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b55a73a0832e495596df888b4e7d879d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Preparing: 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
[HTML table markup was stripped during extraction; the NBLAST scores are identical to the text/plain rendering below.]
\n", + "" + ], + "text/plain": [ + " 754538881_aug 722817260_aug 1734350908_aug 1734350788_aug \\\n", + "1734350788 -0.271504 -0.361381 -0.272018 0.159048 \n", + "1734350908 -0.400432 -0.491376 0.858478 -0.437819 \n", + "722817260 -0.157433 0.127931 -0.365195 -0.407436 \n", + "754534424 -0.237769 -0.413532 -0.193794 -0.401148 \n", + "754538881 -0.021153 -0.213349 -0.200214 -0.303390 \n", + "\n", + " 754534424_aug \n", + "1734350788 -0.325920 \n", + "1734350908 -0.216206 \n", + "722817260 -0.393940 \n", + "754534424 0.857898 \n", + "754538881 -0.115017 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "original_dps = dotprops[:len(original)]\n", + "new_dps = [\n", + " navis.make_dotprops(\n", + " augment_neuron(n),\n", + " k=5,\n", + " resample=False\n", + " )\n", + " for n in original\n", + "]\n", + "RNG.shuffle(new_dps)\n", + "\n", + "result = navis.nblast(\n", + " original_dps, new_dps,\n", + " use_alpha=True, scores=\"mean\", normalized=True,\n", + " smat=smat,\n", + " n_cores=1,\n", + ")\n", + "result.index = [dp.name for dp in original_dps]\n", + "result.columns = [dp.name for dp in new_dps]\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see that while there aren't any particularly good scores\n", + "(this is a very small amount of training data, and one would normally preprocess the neurons),\n", + "in each case the original's best match is its augmented partner." + ] + } + ], + "metadata": { + "interpreter": { + "hash": "97618690babd5eee2c893391179393f7dcc498027dfffda6cb7b8d10b95474fd" + }, + "kernelspec": { + "display_name": "Python 3.9.4 64-bit ('navis': pyenv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 60c26dd4960d516a28ed0d9da12d2d2870a804ff Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:19:46 +0000 Subject: [PATCH 16/21] fix lru cache --- navis/nbl/smat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 63982a51..2c6f07e6 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -201,7 +201,7 @@ def _empty_counts(self) -> np.ndarray: shape = [len(b) for b in self.digitizers] return np.zeros(shape, int) - @lru_cache + @lru_cache(None) def _query(self, q_idx, t_idx) -> List[np.ndarray]: """Get the results of applying the match function to dotprops specified by indices""" return self.match_fn(self.dotprops[q_idx], self.dotprops[t_idx]) From 9246cfbdb9e57719a3ce064cd003757f1e45960f Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:27:17 +0000 Subject: [PATCH 17/21] Remove ScoreFunction references --- navis/nbl/nblast_funcs.py | 67 ------------------------------------- navis/nbl/synblast_funcs.py | 3 +- 2 files changed, 1 insertion(+), 69 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index e5a7e386..f1f5463e 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -39,73 +39,6 @@ logger = config.logger -class ScoringFunction: - """Class representing scoring function.""" - - def __init__(self, smat): - if isinstance(smat, type(None)): - self.scoring_function = self.pass_through - elif 
isinstance(smat, (pd.DataFrame, str)): - self.parse_matrix(smat) - self.scoring_function = self.score_lookup - else: - raise TypeError - - def __call__(self, dist, dot): - return self.scoring_function(dist, dot) - - @property - def max_dist(self): - """Max distance considered by the scoring matrix. - - Returns ``None`` if pass through. - """ - if self.scoring_function == self.pass_through: - return None - # The last bin is always `np.inf`, so we need the second last bin - return self.dist_bins[-2] - - def pass_through(self, dist, dot): - """Pass-through scores if no scoring matrix.""" - return dist * dot - - def score_lookup(self, dist, dot): - return self.cells[ - np.digitize(dist, self.dist_bins), - np.digitize(dot, self.dot_bins), - ] - - def parse_matrix(self, smat): - """Parse matrix.""" - if isinstance(smat, str): - smat = pd.read_csv(smat, index_col=0) - - if not isinstance(smat, pd.DataFrame): - raise TypeError(f'Excepted filepath or DataFrame, got "{type(smat)}"') - - self.cells = smat.to_numpy() - - self.dist_thresholds = [self.parse_interval(s) for s in smat.index] - # Make sure right bin is open - self.dist_thresholds[-1] = np.inf - self.dist_bins = np.array(self.dist_thresholds, float) - - self.dot_thresholds = [self.parse_interval(s) for s in smat.columns] - # Make sure right bin is open - self.dot_thresholds[-1] = np.inf - self.dot_bins = np.array(self.dot_thresholds, float) - - def parse_interval(self, s): - """Strip brackets and parse right interval. - - Example - ------- - >>> parse_intervals("(0,0.1]") # doctest: +SKIP - 0.1 - """ - return float(s.strip("([])").split(",")[-1]) - - class NBlaster(Blaster): """Implements version 2 of the NBLAST algorithm. diff --git a/navis/nbl/synblast_funcs.py b/navis/nbl/synblast_funcs.py index eec20c1c..433039ea 100644 --- a/navis/nbl/synblast_funcs.py +++ b/navis/nbl/synblast_funcs.py @@ -31,7 +31,7 @@ from .base import Blaster, NestedIndices from .smat import Lookup2d -from .nblast_funcs import (check_microns, find_optimal_partition, ScoringFunction, +from .nblast_funcs import (check_microns, find_optimal_partition, nblast_preflight, smat_fcwb) __all__ = ['synblast'] @@ -92,7 +92,6 @@ def __init__(self, normalized=True, by_type=True, else: self.score_fn = smat - self.score_fn = ScoringFunction(smat) self.ids = [] def append(self, neuron, id=None) -> NestedIndices: From 62a1daa5d76de8420bd1c6ed9fe9fa6db969712c Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 30 Nov 2021 14:49:04 +0000 Subject: [PATCH 18/21] fix limit_dist test --- navis/nbl/nblast_funcs.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index f1f5463e..8fc24384 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -96,15 +96,14 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.score_fn = smat - self.distance_upper_bound = None - - if limit_dist is not None: - self.distance_upper_bound = limit_dist - else: + if limit_dist == "auto": try: self.distance_upper_bound = self.score_fn.digitizers[0]._max except AttributeError: - pass + logger.warning("Could not infer distance upper bound from scoring function") + self.distance_upper_bound = None + else: + self.distance_upper_bound = limit_dist def append(self, dotprops) -> NestedIndices: """Append dotprops. 
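[Aside: a minimal usage sketch of the behaviour the two fixes above settle on — hypothetical data, but the signatures match the patched code. ``append`` now hands back positional indices which feed straight into ``single_query_target``, and with ``limit_dist='auto'`` the upper distance bound is read off the score function's first axis when available.]

    import navis
    from navis.nbl.nblast_funcs import NBlaster

    dps = [navis.make_dotprops(n, k=5) for n in navis.example_neurons(2)]
    blaster = NBlaster(smat='auto', limit_dist='auto')
    q_idx = blaster.append(dps[0])   # -> 0: positional index, not the neuron's id
    t_idx = blaster.append(dps[1])   # -> 1
    forward = blaster.single_query_target(q_idx, t_idx, scores='forward')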
From e3ccc4d937117186f8774a9cae59221aa37d33ca Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Tue, 7 Dec 2021 11:12:15 +0000 Subject: [PATCH 19/21] Add SimpleLookup for non-float cases --- navis/nbl/nblast_funcs.py | 2 +- navis/nbl/smat.py | 85 +++++++++++++++++++++++++++++++------ tests/test_nbl/test_smat.py | 2 +- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 8fc24384..9bf1ffe1 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -98,7 +98,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', if limit_dist == "auto": try: - self.distance_upper_bound = self.score_fn.digitizers[0]._max + self.distance_upper_bound = self.score_fn.axes[0]._max except AttributeError: logger.warning("Could not infer distance upper bound from scoring function") self.distance_upper_bound = None diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 2c6f07e6..8aa9ee8c 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -1,10 +1,12 @@ from __future__ import annotations +from abc import ABC, abstractmethod from itertools import permutations import sys import os from collections import Counter from concurrent.futures import ProcessPoolExecutor from typing import ( + Generic, Hashable, Iterator, Mapping, @@ -15,6 +17,7 @@ Iterable, Any, Tuple, + TypeVar, Union, ) import logging @@ -443,14 +446,70 @@ def parse_boundary(item: str): return tuple(float(i) for i in item[1:-1].split(",")), right -class Digitizer: +T = TypeVar("T") + + +class LookupAxis(ABC, Generic[T]): + """Class converting some data into a linear index.""" + + @abstractmethod + def __len__(self) -> int: + """Number of bins represented by this instance.""" + pass + + @abstractmethod + def __call__(self, value: Union[T, Sequence[T]]) -> Union[int, Sequence[int]]: + """Convert some data into a linear index. + + Parameters + ---------- + value : Union[T, Sequence[T]] + Value to convert into an index + + Returns + ------- + Union[int, Sequence[int]] + If a scalar was given, return a scalar; otherwise, a numpy array of ints. + """ + pass + + +class SimpleLookup(LookupAxis[Hashable]): + def __init__(self, items: List[Hashable]): + """Look up in a list of items and return their index. + + Parameters + ---------- + items : List[Hashable] + The item's position in the list is the index which will be returned. + + Raises + ------ + ValueError + items are non-unique. + """ + self.items = {item: idx for idx, item in enumerate(items)} + if len(self.items) != len(items): + raise ValueError("Items are not unique") + + def __len__(self) -> int: + return len(self.items) + + def __call__(self, value: Union[Hashable, Sequence[Hashable]]) -> Union[int, Sequence[int]]: + if np.isscalar(value): + return self.items[value] + else: + return np.array([self.items[v] for v in value], int) + + +class Digitizer(LookupAxis[float]): def __init__( self, boundaries: Sequence[float], clip: Tuple[bool, bool] = (True, True), right=False, ): - """Class converting continuous values into discrete indices given specific bin boundaries. + """Class converting continuous values into discrete indices. 
Parameters ---------- @@ -637,29 +696,29 @@ def __eq__(self, other: object) -> bool: class LookupNd: - def __init__(self, digitizers: List[Digitizer], cells: np.ndarray): - if [len(b) for b in digitizers] != list(cells.shape): + def __init__(self, axes: List[LookupAxis], cells: np.ndarray): + if [len(b) for b in axes] != list(cells.shape): raise ValueError("boundaries and cells have inconsistent bin counts") - self.digitizers = digitizers + self.axes = axes self.cells = cells def __call__(self, *args): - if len(args) != len(self.digitizers): + if len(args) != len(self.axes): raise TypeError( - f"Lookup takes {len(self.digitizers)} arguments but {len(args)} were given" + f"Lookup takes {len(self.axes)} arguments but {len(args)} were given" ) - idxs = tuple(d(arg) for d, arg in zip(self.digitizers, args)) + idxs = tuple(d(arg) for d, arg in zip(self.axes, args)) out = self.cells[idxs] return out class Lookup2d(LookupNd): - """Convenience class inheriting from LookupNd for the common 2D case. + """Convenience class inheriting from LookupNd for the common 2D float case. Provides IO with pandas DataFrames. """ - def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarray): + def __init__(self, axis0: Digitizer, axis1: Digitizer, cells: np.ndarray): """2D lookup table for convert NBLAST matches to scores. Commonly read from a ``pandas.DataFrame`` @@ -674,7 +733,7 @@ def __init__(self, digitizer0: Digitizer, digitizer1: Digitizer, cells: np.ndarr cells : np.ndarray Values to look up in the table. """ - super().__init__([digitizer0, digitizer1], cells) + super().__init__([axis0, axis1], cells) def to_dataframe(self) -> pd.DataFrame: """Convert the lookup table into a ``pandas.DataFrame``. @@ -689,8 +748,8 @@ def to_dataframe(self) -> pd.DataFrame: """ return pd.DataFrame( self.cells, - self.digitizers[0].to_strings(), - self.digitizers[1].to_strings(), + self.axes[0].to_strings(), + self.axes[1].to_strings(), ) @classmethod diff --git a/tests/test_nbl/test_smat.py b/tests/test_nbl/test_smat.py index 76d7f9fb..04d2d4f6 100644 --- a/tests/test_nbl/test_smat.py +++ b/tests/test_nbl/test_smat.py @@ -87,7 +87,7 @@ def test_lookup2d_roundtrip(): df = lookup.to_dataframe() lookup2 = Lookup2d.from_dataframe(df) assert np.allclose(lookup.cells, lookup2.cells) - for b1, b2 in zip(lookup.digitizers, lookup2.digitizers): + for b1, b2 in zip(lookup.axes, lookup2.axes): assert b1 == b2 From 62b80943f38fdd236443e9d0ce457e4e818eff74 Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Wed, 9 Feb 2022 13:10:42 +0000 Subject: [PATCH 20/21] WIP units --- navis/config.py | 5 +- navis/nbl/nblast_funcs.py | 29 +++++++++--- navis/nbl/smat.py | 83 ++++++++++++++++++++++++++++----- navis/units.py | 97 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 194 insertions(+), 20 deletions(-) create mode 100644 navis/units.py diff --git a/navis/config.py b/navis/config.py index dd380d17..8b802906 100644 --- a/navis/config.py +++ b/navis/config.py @@ -17,6 +17,8 @@ import matplotlib as mpl +from .units import ureg # noqa: F401 + logger = logging.getLogger('navis') @@ -69,9 +71,6 @@ def remove_log_handlers(): # Default color for neurons default_color = (.95, .65, .04) -# Unit registry -ureg = pint.UnitRegistry() - # Set to true to prevent Viewer from ever showing headless = os.environ.get('NAVIS_HEADLESS', 'False').lower() == 'true' if headless: diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 9bf1ffe1..11cde829 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py 
@@ -22,14 +22,18 @@ import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional, List +from typing import Union, Optional from typing_extensions import Literal +import pint + +from pint.quantity import Quantity from navis.nbl.smat import Lookup2d, smat_fcwb from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops from .base import Blaster, NestedIndices +from ..units import as_unit __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -105,7 +109,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops) -> NestedIndices: + def append(self, dotprops, ignore_units=False) -> NestedIndices: """Append dotprops. Returns the numerical index appended dotprops. @@ -113,15 +117,28 @@ def append(self, dotprops) -> NestedIndices: return a (possibly nested) list of indices. """ if isinstance(dotprops, Dotprops): - return self._append_dotprops(dotprops) + return self._append_dotprops(dotprops, ignore_units) try: - return [self.append(n) for n in dotprops] + return [self.append(n, ignore_units) for n in dotprops] except TypeError: # i.e. not iterable raise ValueError(f"Expected Dotprops or iterable thereof; got {type(dotprops)}") - def _append_dotprops(self, dotprops: Dotprops) -> int: - next_id = len(self) + def _append_dotprops(self, dotprops: Dotprops, ignore_units) -> int: + if not ignore_units: + # if isinstance(dotprops.units, pint.Quantity): + # if np.allclose(1, dotprops.units): + # units = dotprops.units.units + # else: + # logger.warning( + # "Dotprops coordinates are not unitary (%s). " + # "This might lead to unexpected results.", + # dotprops.units + # ) + # elif dotprops: + # units = as_unit(dotprops.units) + + # if as_unit(dotprops.units) self.neurons.append(dotprops) self.ids.append(dotprops.id) # Calculate score for self hit diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index 8aa9ee8c..a18fc8ed 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -28,14 +28,18 @@ import math from collections import defaultdict +import pint import numpy as np import pandas as pd +from ..config import ureg from ..core.neurons import Dotprops +from ..units import parse_quantity, reduce_units, as_unit, DIMENSIONLESS logger = logging.getLogger(__name__) DEFAULT_SEED = 1991 +DIMENSIONLESS = pint.Unit("dimensionless") epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) @@ -433,7 +437,23 @@ def is_monotonically_increasing(lst): return True -def parse_boundary(item: str): +def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: + """Parse a string representing a half-open interval. 
+ + Parameters + ---------- + item : str + An interval formatted like ``[1 nm, inf nm)`` + + Returns + ------- + Tuple[Tuple[float, float], bool, pint.Unit] + ( + (lower_bound, upper_bound), + include_right, + units + ) + """ explicit_interval = item[0] + item[-1] if explicit_interval == "[)": right = False @@ -443,7 +463,18 @@ def parse_boundary(item: str): raise ValueError( f"Enclosing characters '{explicit_interval}' do not match a half-open interval" ) - return tuple(float(i) for i in item[1:-1].split(",")), right + + out = [] + units = [] + for s in item[1:-1].split(","): + q = parse_quantity(s) + out.append(q.magnitude) + units.append(q.units) + + if len(out) != 2: + raise ValueError(f"Could not parse interval, got {len(out)} values instead of 2") + + return tuple(out), right, reduce_units(units) T = TypeVar("T") @@ -452,6 +483,8 @@ def parse_boundary(item: str): class LookupAxis(ABC, Generic[T]): """Class converting some data into a linear index.""" + units: pint.Unit = DIMENSIONLESS + @abstractmethod def __len__(self) -> int: """Number of bins represented by this instance.""" @@ -473,6 +506,13 @@ def __call__(self, value: Union[T, Sequence[T]]) -> Union[int, Sequence[int]]: """ pass + def same_units(self, unit: Union[str, pint.Unit]) -> bool: + try: + reduce_units([self.units, as_unit(unit)]) + except ValueError: + return False + return True + class SimpleLookup(LookupAxis[Hashable]): def __init__(self, items: List[Hashable]): @@ -508,6 +548,8 @@ def __init__( boundaries: Sequence[float], clip: Tuple[bool, bool] = (True, True), right=False, + *, + units=DIMENSIONLESS, ): """Class converting continuous values into discrete indices. @@ -546,6 +588,7 @@ def __init__( raise ValueError("Boundaries are not monotonically increasing") self.boundaries = np.asarray(boundaries) + self.units = as_unit(units) def __len__(self): return len(self.boundaries) - 1 @@ -570,8 +613,14 @@ def to_strings(self) -> List[str]: b = self.boundaries.copy() b[0] = self._min b[-1] = self._max + + if self.units == DIMENSIONLESS: + unit = "" + else: + unit = " " + str(self.units) + return [ - f"{lb}{lower},{upper}{rb}" + f"{lb}{lower}{unit},{upper}{unit}{rb}" for lower, upper in zip(b[:-1], b[1:]) ] @@ -596,8 +645,10 @@ def from_strings(cls, interval_strs: Sequence[str]): bounds: List[float] = [] last_upper = None last_right = None + units = [] for item in interval_strs: - (lower, upper), right = parse_boundary(item) + (lower, upper), right, unit = parse_interval(item) + units.append(unit) bounds.append(float(lower)) if last_right is not None: @@ -613,10 +664,10 @@ def from_strings(cls, interval_strs: Sequence[str]): last_upper = upper bounds.append(float(last_upper)) - return cls(bounds, right=last_right) + return cls(bounds, right=last_right, units=reduce_units(units)) @classmethod - def from_linear(cls, lower: float, upper: float, nbins: int, right=False): + def from_linear(cls, lower: float, upper: float, nbins: int, right=False, *, units=DIMENSIONLESS): """Choose digitizer boundaries spaced linearly between two values. Input values will be clipped to fit within the given interval. 
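[Aside: a hedged illustration of the interval strings these docstrings describe; exact float formatting may differ. It mirrors the round-trip exercised by test_lookup2d_roundtrip in the test suite.]

    from navis.nbl.smat import Digitizer

    dig = Digitizer.from_linear(0, 1, 5)  # 5 bins spanning [0, 1]
    dig.to_strings()  # e.g. ['[0.0,0.2)', '[0.2,0.4)', '[0.4,0.6)', '[0.6,0.8)', '[0.8,1.0)']
    # The serialised form parses back to an equivalent Digitizer:
    assert Digitizer.from_strings(dig.to_strings()) == dig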
@@ -638,10 +689,10 @@ def from_linear(cls, lower: float, upper: float, nbins: int, right=False):
         Digitizer
         """
         arr = np.linspace(lower, upper, nbins + 1, endpoint=True)
-        return cls(arr, right=right)
+        return cls(arr, right=right, units=units)
 
     @classmethod
-    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False):
+    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False, *, units=DIMENSIONLESS):
         """Choose digitizer boundaries in a geometric sequence.
 
         Additional bins will be added above and below the given values.
@@ -664,7 +715,7 @@ def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=
             Digitizer
         """
         arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True)
-        return cls(arr, clip=(False, False), right=right)
+        return cls(arr, clip=(False, False), right=right, units=units)
 
     @classmethod
     def from_data(cls, data: Sequence[float], nbins: int, right=False):
@@ -684,8 +735,13 @@ def from_data(cls, data: Sequence[float], nbins: int, right=False):
         -------
         Digitizer
         """
+        if isinstance(data, pint.Quantity):
+            units = data.units
+            data = data.magnitude
+        else:
+            units = DIMENSIONLESS
         arr = np.quantile(data, np.linspace(0, 1, nbins + 1, True))
-        return cls(arr, right=right)
+        return cls(arr, right=right, units=units)
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, Digitizer):
@@ -712,6 +768,10 @@ def __call__(self, *args):
         out = self.cells[idxs]
         return out
 
+    @property
+    def units(self):
+        return tuple(ax.units for ax in self.axes)
+
 
 class Lookup2d(LookupNd):
     """Convenience class inheriting from LookupNd for the common 2D float case.
@@ -776,7 +836,9 @@ def _smat_fcwb(alpha=False):
     fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha]
     fpath = smat_path / fname
 
-    return Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
+    lookup = Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
+    lookup.axes[0].units = ureg.Unit("micrometer")
+    return lookup
 
 
 def smat_fcwb(alpha=False):
diff --git a/navis/units.py b/navis/units.py
new file mode 100644
index 00000000..85a39535
--- /dev/null
+++ b/navis/units.py
@@ -0,0 +1,97 @@
+import re
+from typing import Optional, Union, Sequence
+
+import numpy as np
+import pint
+
+DIMENSIONLESS = pint.Unit("dimensionless")
+
+INF_RE = re.compile(r"(?P<is_neg>-?)inf(inity)?")
+
+ureg = pint.UnitRegistry()
+ureg.define("@alias micron = micrometer")
+
+
+def parse_quantity(item: str) -> pint.Quantity:
+    """Parse strings into ``pint.Quantity``, accounting for infinity.
+
+    Parameters
+    ----------
+    item : str
+        A quantity string like those used by ``pint``.
+
+    Returns
+    -------
+    pint.Quantity
+    """
+    item = item.strip()
+    try:
+        q = ureg.Quantity(item)
+    except pint.UndefinedUnitError as e:
+        first, *other = item.split()
+        m = INF_RE.match(first)
+        if m is None:
+            raise e
+
+        val = float("inf")
+        if m.groupdict()["is_neg"]:
+            val *= -1
+        unit = ureg.Unit(" ".join(other))
+        q = ureg.Quantity(val, unit)
+
+    return q
+
+
+def as_unit(unit: Optional[Union[str, pint.Unit]]) -> pint.Unit:
+    """Convert a string (or None) into a ``pint.Unit``.
+
+    Parameters
+    ----------
+    unit : Optional[Union[str, pint.Unit]]
+
+    Returns
+    -------
+    pint.Unit
+        If the ``unit`` argument was ``None``, return dimensionless.
+ """ + if unit is None: + return DIMENSIONLESS + + if isinstance(unit, pint.Unit): + return unit + + return ureg.Unit(unit) + + +def reduce_units(units: Sequence[Optional[Union[str, pint.Unit]]]) -> pint.Unit: + """Reduce a sequence of units or unit-like strings down to a single ``pint.Unit``. + + Dimensionless units are ignored. + + Parameters + ---------- + units : Sequence[Optional[Union[str, pint.Unit]]] + ``None`` is treated as dimensionless. + + Returns + ------- + pint.Unit + Consensus units of the sequence. + + Raises + ------ + ValueError + If more than one non-dimensionless unit is found. + """ + # use np.unique instead of set operations here, + # because setting aliases in the registry affects + # __eq__ (comparisons as used by np.unique) but not + # __hash__ (as used by sets) + unit_set = np.unique([DIMENSIONLESS] + [as_unit(u1) for u1 in units]) + if len(unit_set) == 1: + return DIMENSIONLESS + actuals = list(unit_set) + actuals.remove(DIMENSIONLESS) + if len(actuals) == 1: + return actuals[0] + raise ValueError(f"More than one real unit found: {sorted(unit_set)}") From dfc6b4bf686e087cddbc397f5ab86c288220c9ca Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Mon, 11 Apr 2022 13:39:58 +0100 Subject: [PATCH 21/21] Revert "WIP units" This reverts commit 62b80943f38fdd236443e9d0ce457e4e818eff74. --- navis/config.py | 5 +- navis/nbl/nblast_funcs.py | 29 +++--------- navis/nbl/smat.py | 83 +++++---------------------------- navis/units.py | 97 --------------------------------------- 4 files changed, 20 insertions(+), 194 deletions(-) delete mode 100644 navis/units.py diff --git a/navis/config.py b/navis/config.py index 8b802906..dd380d17 100644 --- a/navis/config.py +++ b/navis/config.py @@ -17,8 +17,6 @@ import matplotlib as mpl -from .units import ureg # noqa: F401 - logger = logging.getLogger('navis') @@ -71,6 +69,9 @@ def remove_log_handlers(): # Default color for neurons default_color = (.95, .65, .04) +# Unit registry +ureg = pint.UnitRegistry() + # Set to true to prevent Viewer from ever showing headless = os.environ.get('NAVIS_HEADLESS', 'False').lower() == 'true' if headless: diff --git a/navis/nbl/nblast_funcs.py b/navis/nbl/nblast_funcs.py index 11cde829..9bf1ffe1 100644 --- a/navis/nbl/nblast_funcs.py +++ b/navis/nbl/nblast_funcs.py @@ -22,18 +22,14 @@ import pandas as pd from concurrent.futures import ProcessPoolExecutor -from typing import Union, Optional +from typing import Union, Optional, List from typing_extensions import Literal -import pint - -from pint.quantity import Quantity from navis.nbl.smat import Lookup2d, smat_fcwb from .. import utils, config from ..core import NeuronList, Dotprops, make_dotprops from .base import Blaster, NestedIndices -from ..units import as_unit __all__ = ['nblast', 'nblast_smart', 'nblast_allbyall', 'sim_to_dist'] @@ -109,7 +105,7 @@ def __init__(self, use_alpha=False, normalized=True, smat='auto', else: self.distance_upper_bound = limit_dist - def append(self, dotprops, ignore_units=False) -> NestedIndices: + def append(self, dotprops) -> NestedIndices: """Append dotprops. Returns the numerical index appended dotprops. @@ -117,28 +113,15 @@ def append(self, dotprops, ignore_units=False) -> NestedIndices: return a (possibly nested) list of indices. """ if isinstance(dotprops, Dotprops): - return self._append_dotprops(dotprops, ignore_units) + return self._append_dotprops(dotprops) try: - return [self.append(n, ignore_units) for n in dotprops] + return [self.append(n) for n in dotprops] except TypeError: # i.e. 
not iterable raise ValueError(f"Expected Dotprops or iterable thereof; got {type(dotprops)}") - def _append_dotprops(self, dotprops: Dotprops, ignore_units) -> int: - if not ignore_units: - # if isinstance(dotprops.units, pint.Quantity): - # if np.allclose(1, dotprops.units): - # units = dotprops.units.units - # else: - # logger.warning( - # "Dotprops coordinates are not unitary (%s). " - # "This might lead to unexpected results.", - # dotprops.units - # ) - # elif dotprops: - # units = as_unit(dotprops.units) - - # if as_unit(dotprops.units) + def _append_dotprops(self, dotprops: Dotprops) -> int: + next_id = len(self) self.neurons.append(dotprops) self.ids.append(dotprops.id) # Calculate score for self hit diff --git a/navis/nbl/smat.py b/navis/nbl/smat.py index a18fc8ed..8aa9ee8c 100644 --- a/navis/nbl/smat.py +++ b/navis/nbl/smat.py @@ -28,18 +28,14 @@ import math from collections import defaultdict -import pint import numpy as np import pandas as pd -from ..config import ureg from ..core.neurons import Dotprops -from ..units import parse_quantity, reduce_units, as_unit, DIMENSIONLESS logger = logging.getLogger(__name__) DEFAULT_SEED = 1991 -DIMENSIONLESS = pint.Unit("dimensionless") epsilon = sys.float_info.epsilon cpu_count = max(1, (os.cpu_count() or 2) - 1) @@ -437,23 +433,7 @@ def is_monotonically_increasing(lst): return True -def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: - """Parse a string representing a half-open interval. - - Parameters - ---------- - item : str - An interval formatted like ``[1 nm, inf nm)`` - - Returns - ------- - Tuple[Tuple[float, float], bool, pint.Unit] - ( - (lower_bound, upper_bound), - include_right, - units - ) - """ +def parse_boundary(item: str): explicit_interval = item[0] + item[-1] if explicit_interval == "[)": right = False @@ -463,18 +443,7 @@ def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: raise ValueError( f"Enclosing characters '{explicit_interval}' do not match a half-open interval" ) - - out = [] - units = [] - for s in item[1:-1].split(","): - q = parse_quantity(s) - out.append(q.magnitude) - units.append(q.units) - - if len(out) != 2: - raise ValueError(f"Could not parse interval, got {len(out)} values instead of 2") - - return tuple(out), right, reduce_units(units) + return tuple(float(i) for i in item[1:-1].split(",")), right T = TypeVar("T") @@ -483,8 +452,6 @@ def parse_interval(item: str) -> Tuple[Tuple[float, float], bool, pint.Unit]: class LookupAxis(ABC, Generic[T]): """Class converting some data into a linear index.""" - units: pint.Unit = DIMENSIONLESS - @abstractmethod def __len__(self) -> int: """Number of bins represented by this instance.""" @@ -506,13 +473,6 @@ def __call__(self, value: Union[T, Sequence[T]]) -> Union[int, Sequence[int]]: """ pass - def same_units(self, unit: Union[str, pint.Unit]) -> bool: - try: - reduce_units([self.units, as_unit(unit)]) - except ValueError: - return False - return True - class SimpleLookup(LookupAxis[Hashable]): def __init__(self, items: List[Hashable]): @@ -548,8 +508,6 @@ def __init__( boundaries: Sequence[float], clip: Tuple[bool, bool] = (True, True), right=False, - *, - units=DIMENSIONLESS, ): """Class converting continuous values into discrete indices. 
@@ -588,7 +546,6 @@ def __init__(
             raise ValueError("Boundaries are not monotonically increasing")
 
         self.boundaries = np.asarray(boundaries)
-        self.units = as_unit(units)
 
     def __len__(self):
         return len(self.boundaries) - 1
@@ -613,14 +570,8 @@ def to_strings(self) -> List[str]:
         b = self.boundaries.copy()
         b[0] = self._min
         b[-1] = self._max
-
-        if self.units == DIMENSIONLESS:
-            unit = ""
-        else:
-            unit = " " + str(self.units)
-
         return [
-            f"{lb}{lower}{unit},{upper}{unit}{rb}"
+            f"{lb}{lower},{upper}{rb}"
            for lower, upper in zip(b[:-1], b[1:])
         ]
 
@@ -645,10 +596,8 @@ def from_strings(cls, interval_strs: Sequence[str]):
         bounds: List[float] = []
         last_upper = None
         last_right = None
-        units = []
         for item in interval_strs:
-            (lower, upper), right, unit = parse_interval(item)
-            units.append(unit)
+            (lower, upper), right = parse_boundary(item)
             bounds.append(float(lower))
 
             if last_right is not None:
@@ -664,10 +613,10 @@ def from_strings(cls, interval_strs: Sequence[str]):
             last_upper = upper
 
         bounds.append(float(last_upper))
-        return cls(bounds, right=last_right, units=reduce_units(units))
+        return cls(bounds, right=last_right)
 
     @classmethod
-    def from_linear(cls, lower: float, upper: float, nbins: int, right=False, *, units=DIMENSIONLESS):
+    def from_linear(cls, lower: float, upper: float, nbins: int, right=False):
         """Choose digitizer boundaries spaced linearly between two values.
 
         Input values will be clipped to fit within the given interval.
@@ -689,10 +638,10 @@ def from_linear(cls, lower: float, upper: float, nbins: int, right=False, *, uni
             Digitizer
         """
         arr = np.linspace(lower, upper, nbins + 1, endpoint=True)
-        return cls(arr, right=right, units=units)
+        return cls(arr, right=right)
 
     @classmethod
-    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False, *, units=DIMENSIONLESS):
+    def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=False):
         """Choose digitizer boundaries in a geometric sequence.
 
         Additional bins will be added above and below the given values.
@@ -715,7 +664,7 @@ def from_geom(cls, lowest_upper: float, highest_lower: float, nbins: int, right=
             Digitizer
         """
         arr = np.geomspace(lowest_upper, highest_lower, nbins - 1, True)
-        return cls(arr, clip=(False, False), right=right, units=units)
+        return cls(arr, clip=(False, False), right=right)
 
     @classmethod
     def from_data(cls, data: Sequence[float], nbins: int, right=False):
@@ -735,13 +684,8 @@ def from_data(cls, data: Sequence[float], nbins: int, right=False):
         -------
         Digitizer
         """
-        if isinstance(data, pint.Quantity):
-            units = data.units
-            data = data.magnitude
-        else:
-            units = DIMENSIONLESS
         arr = np.quantile(data, np.linspace(0, 1, nbins + 1, True))
-        return cls(arr, right=right, units=units)
+        return cls(arr, right=right)
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, Digitizer):
@@ -768,10 +712,6 @@ def __call__(self, *args):
         out = self.cells[idxs]
         return out
 
-    @property
-    def units(self):
-        return tuple(ax.units for ax in self.axes)
-
 
 class Lookup2d(LookupNd):
     """Convenience class inheriting from LookupNd for the common 2D float case.
@@ -836,9 +776,7 @@ def _smat_fcwb(alpha=False):
     fname = ("smat_fcwb.csv", "smat_alpha_fcwb.csv")[alpha]
     fpath = smat_path / fname
 
-    lookup = Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
-    lookup.axes[0].units = ureg.Unit("micrometer")
-    return lookup
+    return Lookup2d.from_dataframe(pd.read_csv(fpath, index_col=0))
 
 
 def smat_fcwb(alpha=False):
diff --git a/navis/units.py b/navis/units.py
deleted file mode 100644
index 85a39535..00000000
--- a/navis/units.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import re
-from typing import Optional, Union, Sequence
-
-import numpy as np
-import pint
-
-DIMENSIONLESS = pint.Unit("dimensionless")
-
-INF_RE = re.compile(r"(?P<is_neg>-?)inf(inity)?")
-
-ureg = pint.UnitRegistry()
-ureg.define("@alias micron = micrometer")
-
-
-def parse_quantity(item: str) -> pint.Quantity:
-    """Parse strings into ``pint.Quantity``, accounting for infinity.
-
-    Parameters
-    ----------
-    item : str
-        A quantity string like those used by ``pint``.
-
-    Returns
-    -------
-    pint.Quantity
-    """
-    item = item.strip()
-    try:
-        q = ureg.Quantity(item)
-    except pint.UndefinedUnitError as e:
-        first, *other = item.split()
-        m = INF_RE.match(first)
-        if m is None:
-            raise e
-
-        val = float("inf")
-        if m.groupdict()["is_neg"]:
-            val *= -1
-        unit = ureg.Unit(" ".join(other))
-        q = ureg.Quantity(val, unit)
-
-    return q
-
-
-def as_unit(unit: Optional[Union[str, pint.Unit]]) -> pint.Unit:
-    """Convert a string (or None) into a ``pint.Unit``.
-
-    Parameters
-    ----------
-    unit : Optional[Union[str, pint.Unit]]
-
-    Returns
-    -------
-    pint.Unit
-        If the ``unit`` argument was ``None``, return dimensionless.
-    """
-    if unit is None:
-        return DIMENSIONLESS
-
-    if isinstance(unit, pint.Unit):
-        return unit
-
-    return ureg.Unit(unit)
-
-
-def reduce_units(units: Sequence[Optional[Union[str, pint.Unit]]]) -> pint.Unit:
-    """Reduce a sequence of units or unit-like strings down to a single ``pint.Unit``.
-
-    Dimensionless units are ignored.
-
-    Parameters
-    ----------
-    units : Sequence[Optional[Union[str, pint.Unit]]]
-        ``None`` is treated as dimensionless.
-
-    Returns
-    -------
-    pint.Unit
-        Consensus units of the sequence.
-
-    Raises
-    ------
-    ValueError
-        If more than one non-dimensionless unit is found.
-    """
-    # use np.unique instead of set operations here,
-    # because setting aliases in the registry affects
-    # __eq__ (comparisons as used by np.unique) but not
-    # __hash__ (as used by sets)
-    unit_set = np.unique([DIMENSIONLESS] + [as_unit(u1) for u1 in units])
-    if len(unit_set) == 1:
-        return DIMENSIONLESS
-    actuals = list(unit_set)
-    actuals.remove(DIMENSIONLESS)
-    if len(actuals) == 1:
-        return actuals[0]
-    raise ValueError(f"More than one real unit found: {sorted(unit_set)}")
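
For reference when revisiting this revert: below is a rough sketch of how the deleted `navis/units.py` helpers were intended to behave. It is hypothetical usage rather than live navis API — the module no longer exists after this patch — and the import assumes the definitions exactly as shown in the deleted file above.

```python
# Hypothetical usage of the deleted navis/units.py helpers; assumes the
# file above were still importable as navis.units (it is not, post-revert).
from navis.units import parse_quantity, as_unit, reduce_units

# In the pint versions this patch targeted, "inf" is not parseable as a
# magnitude (it raises UndefinedUnitError), so parse_quantity falls back
# to INF_RE and builds the Quantity by hand, using the "is_neg" group
# to pick the sign.
q = parse_quantity("-inf nm")
assert q.magnitude == float("-inf")
assert str(q.units) == "nanometer"

# as_unit normalises None to the dimensionless unit.
assert str(as_unit(None)) == "dimensionless"

# reduce_units picks the consensus unit of a sequence: None and
# dimensionless entries are ignored and a single real unit wins...
assert str(reduce_units([None, "micrometer", "dimensionless"])) == "micrometer"

# ...while two conflicting real units are intended to raise ValueError.
try:
    reduce_units(["nanometer", "micrometer"])
except ValueError as e:
    print(e)  # More than one real unit found: ...
```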
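The unit-aware interval strings from the reverted WIP `Digitizer` round-trip in a similar way. Again, this is a sketch against the now-deleted definitions: the `units` keyword and `parse_interval` only existed in the reverted commit, and the exact `to_strings()` rendering is inferred from the f-string in that patch.

```python
# Hypothetical round trip through the reverted, unit-aware Digitizer;
# assumes the WIP definitions of Digitizer and parse_interval above.
from navis.nbl.smat import Digitizer, parse_interval

dig = Digitizer([1.0, 5.0, 25.0], clip=(False, False), units="nm")
print(dig.to_strings())
# expected, per the f-string in the reverted to_strings():
# ['[1.0 nanometer,5.0 nanometer)', '[5.0 nanometer,25.0 nanometer)']

# parse_interval() splits on the comma, parses each side with
# parse_quantity(), and reduces the two units to a consensus.
bounds, right, unit = parse_interval("[1.0 nanometer,5.0 nanometer)")
assert bounds == (1.0, 5.0)
assert right is False           # "[...)" is closed on the left
assert str(unit) == "nanometer"
```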
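One design note worth keeping from the deleted module: `reduce_units` deduplicated with `np.unique` rather than a `set` because, per the comment in that file, registering an alias changes how pint units compare but not how they hash. A minimal pint-only sketch of that pitfall, independent of navis, with the expected outputs taken from that comment rather than verified against any particular pint release:

```python
import pint

ureg = pint.UnitRegistry()
ureg.define("@alias micron = micrometer")

um = ureg.Unit("micrometer")
micron = ureg.Unit("micron")

# Per the deleted module's comment: __eq__ sees through the alias, so
# equality-based deduplication (e.g. np.unique) collapses the two...
print(um == micron)
# ...but __hash__ may not follow suit, so a set can keep "duplicates".
print(len({um, micron}))
```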