def __setitem__(self, key: str, value: V):
    """\
    Store `value` under `key`, labelling default-indexed values first.

    If `value` exposes an `index` attribute that is a
    :class:`pandas.RangeIndex`, that index is replaced by `self.dim_names`
    before the value is handed to the parent implementation.
    The index is assigned on the passed object itself, not on a copy.
    """
    current_index = getattr(value, "index", None)
    if isinstance(current_index, pd.RangeIndex):
        value.index = self.dim_names
    super().__setitem__(key, value)
import utils from ..utils import convert_to_dict, ensure_df_homogeneous from ..logging import anndata_logger as logger @@ -275,8 +276,18 @@ def __init__( obs: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, var: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, uns: Optional[Mapping[str, Any]] = None, - obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, - varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, + obsm: Optional[ + Union[ + np.ndarray, + Mapping[str, Union[np.ndarray, sparse.spmatrix, pd.DataFrame]], + ] + ] = None, + varm: Optional[ + Union[ + np.ndarray, + Mapping[str, Union[np.ndarray, sparse.spmatrix, pd.DataFrame]], + ] + ] = None, layers: Optional[Mapping[str, Union[np.ndarray, sparse.spmatrix]]] = None, raw: Optional[Mapping[str, Any]] = None, dtype: Union[np.dtype, str] = "float32", @@ -285,8 +296,12 @@ def __init__( filemode: Optional[Literal["r", "r+"]] = None, asview: bool = False, *, - obsp: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, - varp: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, + obsp: Optional[ + Union[np.ndarray, Mapping[str, Union[np.ndarray, sparse.spmatrix]]] + ] = None, + varp: Optional[ + Union[np.ndarray, Mapping[str, Union[np.ndarray, sparse.spmatrix]]] + ] = None, oidx: Index1D = None, vidx: Index1D = None, ): @@ -1334,6 +1349,11 @@ def _get_X(self, use_raw=False, layer=None): else: return self.X + # TODO: export when exporting RefPath. 
+ # resolve_path = ref_path.resolve_path + get_df = ref_path.get_df + get_vector = ref_path.get_vector + def obs_vector(self, k: str, *, layer: Optional[str] = None) -> np.ndarray: """\ Convenience function for returning a 1 dimensional ndarray of values @@ -1364,7 +1384,7 @@ def obs_vector(self, k: str, *, layer: Optional[str] = None) -> np.ndarray: FutureWarning, ) layer = None - return get_vector(self, k, "obs", "var", layer=layer) + return get_vector(self, k, "var", layer=layer) def var_vector(self, k, *, layer: Optional[str] = None) -> np.ndarray: """\ @@ -1396,7 +1416,7 @@ def var_vector(self, k, *, layer: Optional[str] = None) -> np.ndarray: FutureWarning, ) layer = None - return get_vector(self, k, "var", "obs", layer=layer) + return get_vector(self, k, "obs", layer=layer) @utils.deprecated("obs_vector") def _get_obs_array(self, k, use_raw=False, layer=None): diff --git a/anndata/_core/index.py b/anndata/_core/index.py index c98dcaea0..182ec7c6a 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -7,6 +7,9 @@ import pandas as pd from scipy.sparse import spmatrix, issparse +from ..compat import Literal +from . 
def find_vector(
    adata: Union["anndata.AnnData", "raw.Raw"], k: str, idxdim: Literal["obs", "var"]
) -> bool:
    """\
    Locate key `k` either in the names of `idxdim` or in the columns of the
    annotation frame of the other dimension.

    Returns
    -------
    `True` if `k` is a column of the other dimension's data frame,
    `False` if it is an entry of `.{idxdim}_names`.

    Raises
    ------
    ValueError
        If `k` is found in both places.
    KeyError
        If `k` is found in neither.
    """
    # adata could be self if Raw and AnnData shared a parent
    coldim = {"var": "obs"}.get(idxdim, "var")
    columns = getattr(adata, coldim).columns
    names = getattr(adata, f"{idxdim}_names")
    is_column = k in columns
    is_name = k in names

    if is_column != is_name:
        # Exactly one hit: unambiguous.
        return is_column
    if is_column:
        raise ValueError(
            f"Key {k} could be found in both .{idxdim}_names and .{coldim}.columns"
        )
    raise KeyError(
        f"Could not find key {k} in .{idxdim}_names or .{coldim}.columns."
    )


def get_vector(
    adata: Union["anndata.AnnData", "raw.Raw"],
    k: str,
    idxdim: Literal["obs", "var"],
    layer: Optional[str] = None,
):
    """\
    Return the 1D values for key `k`.

    The key is looked up with :func:`find_vector`; a column hit returns that
    column's `.values`, a name hit returns the matching slice of `X`
    (or of `layer`) via :func:`get_x_vector`.
    """
    if not find_vector(adata, k, idxdim):
        return get_x_vector(adata, idxdim, k, layer)
    coldim = {"var": "obs"}.get(idxdim, "var")
    annotation = getattr(adata, coldim)
    return annotation[k].values


def get_x_vector(
    adata: Union["anndata.AnnData", "raw.Raw"],
    idxdim: Literal["obs", "var"],
    k: str,
    layer: str = None,
):
    """\
    Return the flattened slice of `X` (or of `layer`) selected by name `k`
    along dimension `idxdim`. Sparse results are densified first.
    """
    axis = ("obs", "var").index(idxdim)
    selection = adata._normalize_indices(make_slice(k, axis))
    values = adata._get_X(layer=layer)[selection]
    if issparse(values):
        values = values.toarray()
    return np.ravel(values)
var_names(self): def obs_names(self): return self._adata.obs_names + @property + def layers(self): + return {None: self.X} + def __getitem__(self, index): oidx, vidx = self._normalize_indices(index) @@ -163,7 +167,7 @@ def _normalize_indices(self, packed_index): def var_vector(self, k: str) -> np.ndarray: # TODO decorator to copy AnnData.var_vector docstring - return get_vector(self, k, "var", "obs") + return get_vector(self, k, "obs") def obs_vector(self, k: str) -> np.ndarray: # TODO decorator to copy AnnData.obs_vector docstring diff --git a/anndata/_core/ref_path.py b/anndata/_core/ref_path.py new file mode 100644 index 000000000..0626ff730 --- /dev/null +++ b/anndata/_core/ref_path.py @@ -0,0 +1,458 @@ +from enum import Enum +from itertools import product +from typing import Union, Optional # Special +from typing import Iterable, Sequence, Generator # ABCs +from typing import Dict, Tuple # Classes + +import pandas as pd +import numpy as np +import scipy.sparse as ssp + +from ..compat import Literal +from . 
class AttrInfo(Enum):
    """\
    Describes the attributes of an `AnnData` object a path can point into.

    Each member's value is a pair `(validation, short_codes)`:
    `validation` holds one check per path element (a type, a tuple of types,
    or a tuple of allowed values), and `short_codes` maps abbreviations to the
    path prefix they imply.
    """

    layers = ((str, ("obs", "var"), str), dict(l=(), X=("X",)))
    obs = ((str,), dict(o=()))
    obsm = ((str, (int, str)), dict(om=()))
    obsp = ((str, str, (0, 1)), dict(op=()))
    var = ((str,), dict(v=()))
    varm = ((str, (int, str)), dict(vm=()))
    varp = ((str, str, (0, 1)), dict(vp=()))
    raw = ((("X", "var", "varm"), ...), dict(rX=("X",), rv=("var",), rvm=("varm",)))

    def __repr__(self):
        return f"{self.__class__.__name__}.{self.name}"

    @property
    def validation(self):
        """Per-element checks for a path into this attribute."""
        checks, _ = self.value
        return checks

    @property
    def short_codes(self):
        """Mapping of short code to the path prefix it stands for."""
        _, codes = self.value
        return codes

    def validate(self, path: Sequence[Union[str, int]]):
        """\
        Check `path` against this attribute's rules.

        A `raw` path is parsed as a sub-path and validated against the
        attribute it resolves to. Any other path must have exactly as many
        elements as there are checks, and each element must pass its check.

        Raises
        ------
        ValueError
            On a length mismatch or when an element fails its check.
        """
        if self is AttrInfo.raw:
            # layers or obs
            nested = RefPath.parse(path)
            nested._attr.validate(nested.path)
            return

        checks = self.validation
        if len(path) != len(checks):
            raise ValueError(
                f"Path length mismatch: required ({len(self.validation)}) ≠ "
                f"got ({len(path)}) in path {path!r} for attr {self.name}."
            )

        for i, (elem, check) in enumerate(zip(path, checks)):
            err_prefix = f"Element path[{i}]={path[i]!r} in path {path!r}"
            if isinstance(check, type):
                # A single required type.
                if not isinstance(elem, check):
                    raise ValueError(f"{err_prefix} is not of type {check}.")
                continue
            if not isinstance(check, tuple):
                assert False, f"Unhandled check {check!r} for {self}."
            if isinstance(check[0], type):
                # A tuple of acceptable types.
                if not isinstance(elem, check):
                    check_str = ", ".join(typ.__name__ for typ in check)
                    raise ValueError(
                        f"{err_prefix} is not of one of the types {{{check_str}}}."
                    )
            elif all(elem != allowed for allowed in check):
                # A tuple of acceptable literal values.
                check_str = ", ".join(map(repr, check))
                raise ValueError(f"{err_prefix} is not one of {{{check_str}}}.")

    @staticmethod
    def prefix(prefix: str) -> Tuple[str, Tuple[str, ...]]:
        """\
        Translate an attribute name or short code into
        `(attribute name, implied path prefix)`.

        Raises :class:`ValueError` if `prefix` is neither.
        """
        if prefix in AttrInfo.__members__:  # name
            return prefix, ()
        hits = (
            (member.name, member.short_codes[prefix])
            for member in AttrInfo
            if member.short_codes.get(prefix) is not None
        )
        found = next(hits, None)
        if found is None:
            raise ValueError(f"Unknown attr name or short code {prefix!r}.")
        return found
+ Both can contain shortcuts for the attribute: + + {shortcuts} + """ + if isinstance(full_path, RefPath): + return full_path + from_str = isinstance(full_path, str) + if from_str: + # TODO: don’t split off gene names with slashes + full_path = full_path.split("/") + + if not full_path: + raise ValueError(f"No path specified.") + attr_or_code, *path = full_path + # path can be shorthand: "obs/Foo" and "o/Foo" + attr, path_prefix = AttrInfo.prefix(attr_or_code) + if from_str and attr in {"obsp", "varp"}: + if path[-1] not in "01": + raise ValueError(f"Invalid last segment of {attr} path: {path[-1]!r}.") + path[-1] = int(path[-1]) + return RefPath(attr, *path_prefix, *path) + + @property + def attr(self) -> str: + """Name of the referred to attribute in an :class:`~anndata.AnnData` object.""" + return self._attr.name + + @property + def path(self) -> Tuple[Union[str, int], ...]: + """Path to a vector in :attr:`attr`. See :class:`~anndata.RefPath`.""" + return self._path + + @property + def dim(self) -> Literal["obs", "var"]: + """\ + Dimension this path refers to (`obs` or `var`). + + Returns e.g. `var` for an :attr:`attr` of `var`, `varm`, `varp`, + and `obs` for `RefPath("layers", "var", "GeneName")`, + as a vector for a certain `var` has :attr:`~anndata.AnnData.n_obs` entries. + + .. note:: + Returns `var` for `RefPath("raw", "var", "ColName")`, + but note that :attr:`~anndata.AnnData.raw` can have a higher + :attr:`~anndata.AnnData.n_vars` than its parent :class:`~anndata.AnnData`. 
def __eq__(self, other: "RefPath"):
    """\
    Compare with another :class:`RefPath` or with anything
    :meth:`RefPath.parse` accepts (a `'/'`-delimited string or a sequence).

    Objects that cannot be interpreted as a path compare unequal;
    the comparison itself never raises.
    """
    if not isinstance(other, RefPath):
        try:
            other = RefPath.parse(other)
        except (ValueError, TypeError, LookupError):
            # ValueError: malformed path or unknown attribute,
            # TypeError: `other` is not a sequence or holds unhashable parts,
            # LookupError: `other` is too short to contain the expected parts.
            return False
    return self._attr == other._attr and self.path == other.path
def _make_name(self, length: int = 1) -> str:
    """\
    Build a display name from the last `length` parts of `(attr, *path)`.

    * `obsp`/`varp`, `length == 1`: just the `_name` key.
    * `obsm`/`varm`, `length <= 2`: an integer column becomes
      `<name><column + 1>` (e.g. `X_pca1`); a string column is used alone
      for `length == 1` and as `<name>-<column>` for `length == 2`.
    * otherwise the parts are joined with `-`.
    """
    # Path elements may be ints (obsm column, obsp axis), so stringify once.
    parts = [str(part) for part in (self.attr, *self.path)]
    if length == 1 and self._attr in {AttrInfo.obsp, AttrInfo.varp}:
        return parts[-2]  # just key
    # Compare the enum member (`_attr`); `attr` is the plain string name.
    if length in {1, 2} and self._attr in {AttrInfo.obsm, AttrInfo.varm}:
        column = self.path[-1]
        if isinstance(column, int):
            return f"{parts[-2]}{column + 1}"  # X_pca1
        return column if length == 1 else f"{parts[-2]}-{column}"
    if length <= 2:
        return "-".join(parts[-length:])
    # Longer names: prepend the leading parts to the two-part name.
    return "-".join([*parts[-length:-2], self._make_name(length=2)])
@_doc_params(params_resolve=PARAMS_RESOLVE)
def resolve_path(
    adata: "anndata.AnnData",
    *path: Union[str, RefPath, int],
    dim: Optional[Literal["obs", "var"]] = None,
    use_raw: bool = False,
    alias_col: Optional[str] = None,
    layer: Optional[str] = None,
) -> RefPath:
    """\
    Resolves a :class:`~anndata.RefPath`-like :class:`tuple` or :class:`str` key.

    A single :class:`~anndata.RefPath` argument is returned unchanged.
    Complete paths (as a tuple or a single `'/'`-delimited string) are parsed
    directly; anything else is searched for in `adata`.

    Parameters
    ----------
    {params_resolve}
    """
    if not path:
        raise ValueError("No path specified.")
    is_single = len(path) == 1
    first = path[0]

    # An already resolved path is passed through untouched.
    if is_single and isinstance(first, RefPath):
        return first

    # A complete path: either all positional elements,
    # or one "/"-delimited string such as "obs/group".
    try:
        rp = RefPath.parse(first if is_single and isinstance(first, str) else path)
    except (ValueError, IndexError):
        pass  # Not a complete path, resolve it as a partial one below
    else:
        if rp.attr == "layers":
            raise ValueError("Cannot parse layer path containing a dim, use `dim=...`")
        return rp

    # NOTE(review): `use_raw` only swaps the object that is searched; the returned
    # path is not prefixed with "raw". `alias_col` is not used yet (see TODO below).
    if use_raw:
        adata = adata.raw
    if dim not in ["obs", "var", None]:
        raise ValueError(f"`dim` needs to be 'obs' or 'var', not {dim!r}.")
    dims = ["obs", "var"] if dim is None else [dim]

    if not is_single:
        key, *rest = path
    elif isinstance(first, str):
        key, *rest = first.split("/", 1)
    else:
        raise TypeError(
            f"A single path element needs to be a str or RefPath, not {first!r}."
        )
    if rest and layer is None and key in adata.layers:
        layer, key, *rest = key, *rest

    # single string or layer, search in .{{obs,var}}{{_names,.columns}}
    if not rest:
        for idxdim in dims:
            try:
                in_col = index.find_vector(adata, key, idxdim)
                break
            except KeyError:
                pass  # Ignore when not found
        else:  # Did not find it anywhere
            dims_str = " or ".join(
                f"adata.{d}_names/.{'obs' if d == 'var' else 'var'}.columns"
                for d in dims
            )
            raise KeyError(f"Key {key} not found in {dims_str}")
        # TODO
        # if alias_col is not None:
        #     idx = getattr(adata, dim)[alias_col]
        #     key = idx.index[idx == key]
        if in_col:
            coldim = "obs" if idxdim == "var" else "var"
            return RefPath(coldim, key)
        layer = "X" if layer is None else layer
        return RefPath("layers", layer, idxdim, key)

    if len(rest) == 1:
        sub_key = rest[0]

        # search in obsm, varm
        for idxdim in dims:
            m = getattr(adata, f"{idxdim}m")
            if key not in m:
                continue
            arr_df = m[key]
            err_str = f"Column {sub_key!r} not found in adata.{idxdim}m[{key!r}]"
            if isinstance(sub_key, int) and not hasattr(arr_df, "columns"):
                # Positional column of a plain (sparse) array given in a tuple path
                if sub_key >= arr_df.shape[1]:
                    raise IndexError(
                        f"{err_str}, it has only {arr_df.shape[1]} columns"
                    )
            else:
                sub_key = _normalize_arr_df_column(
                    arr_df, err_str, sub_key, is_single and "/" in first
                )
            return RefPath(f"{idxdim}m", key, sub_key)

        # search in obsp, varp: (name, _name) defaults to axis 0
        if isinstance(sub_key, str):
            for idxdim in dims:
                pairwise = getattr(adata, f"{idxdim}p", None)
                if (
                    pairwise is not None
                    and key in pairwise
                    and sub_key in getattr(adata, f"{idxdim}_names")
                ):
                    return RefPath(f"{idxdim}p", key, sub_key, 0)

        raise KeyError(f"Path {key!r}/{sub_key!r} not found in `adata`")

    raise KeyError(f"Path {path!r} not found in `adata`")
@_doc_params(params_resolve=PARAMS_RESOLVE)
def get_vector(
    adata: "anndata.AnnData",
    *path: Union[str, RefPath, int],
    dim: Optional[Literal["obs", "var"]] = None,
    use_raw: bool = False,
    alias_col: Optional[str] = None,
    layer: Optional[str] = None,
) -> np.ndarray:
    """\
    Get a single 1D vector using the `path`.

    Parameters
    ----------
    {params_resolve}
    """
    # `path` is var-positional in `resolve_path`, so it has to be splatted;
    # it cannot be passed by keyword.
    resolved = resolve_path(
        adata, *path, dim=dim, use_raw=use_raw, alias_col=alias_col, layer=layer
    )
    return resolved.get_vector(adata)


@_doc_params(params_resolve=PARAMS_RESOLVE)
def get_df(
    adata: "anndata.AnnData",
    paths: Iterable[RefPathLike],
    *,
    dim: Optional[Literal["obs", "var"]] = None,
    use_raw: bool = False,
    alias_col: Optional[str] = None,
    layer: Optional[str] = None,
) -> pd.DataFrame:
    """\
    Resolves multiple paths, gets vectors via :meth:`~anndata.AnnData.get_vector` and
    joins them to a :class:`pandas.DataFrame`.

    So `("obs", ["A", "B"])` becomes the paths `("obs", "A")` and `("obs", "B")`.
    The data frame column names are unique and as short as possible.
    All paths have to refer to vectors along the same dimension;
    the names of that dimension become the index of the data frame.

    Parameters
    ----------
    {params_resolve}
    """
    kwargs = dict(dim=dim, use_raw=use_raw, alias_col=alias_col, layer=layer)
    resolved = [
        # Tuple paths are splatted, `resolve_path` takes the elements positionally
        resolve_path(adata, *(p if isinstance(p, tuple) else (p,)), **kwargs)
        for p in split_paths(paths)
    ]

    vector_dims = {p.dim for p in resolved}
    if len(vector_dims) > 1:
        raise ValueError(
            "All paths need to refer to the same dimension, "
            f"got vectors along both obs and var: {resolved!r}"
        )
    index_dim = vector_dims.pop() if vector_dims else "obs"
    # Vectors taken from `raw` are as long as `raw`'s own names
    only_raw = bool(resolved) and all(p.attr == "raw" for p in resolved)
    index_owner = adata.raw if only_raw else adata

    names = paths_to_names(resolved)
    columns = {n: p.get_vector(adata) for n, p in names.items()}
    return pd.DataFrame(columns, getattr(index_owner, f"{index_dim}_names"))
def paths_to_names(
    paths: Sequence["RefPath"], length: int = 1
) -> Dict[str, "RefPath"]:
    """\
    Map each path to a unique name that is as short as possible.

    Names are built with `RefPath._make_name(length)`. Paths whose names
    collide are named again with `length + 1`, as long as at least one of them
    still has unused parts in `(attr, *path)`.

    Raises
    ------
    ValueError
        If colliding paths cannot be told apart by any longer name.
    """
    groups = {}
    for path in paths:
        groups.setdefault(path._make_name(length), []).append(path)

    names = {}
    for name, paths_dup in groups.items():
        if len(paths_dup) == 1:
            names[name] = paths_dup[0]
        # A name is built from `(attr, *path)`, i.e. `len(path) + 1` parts.
        elif any(len(p.path) + 1 > length for p in paths_dup):
            names.update(paths_to_names(paths_dup, length + 1))
        else:
            raise ValueError(f"Not sure how {name} can be extended for {paths_dup}")
    return names
[0.1, 1.2, 2.3, 3.4]), + (("obs", "group"), "obs", ["batch1", "batch2"]), + (("obsm", "X_pca", 1), "obs", [0.4, 0.3]), + (("obsm", "protein", "CD14"), "obs", [2.1, 3.2]), + (("obsp", "neighbors_distances", "Cell2", 0), "obs", [0.3, 0]), + (("var", "mito"), "var", [0.4, 0.3, 0.2, 0.1]), + # (("varm", "", ""), "var", []), + (("varp", "cors", "GeneY", 1), "var", [3] * 4), + (("raw", "X", "var", "GeneA"), "obs", [88, 99]), + # TODO: is var correct here? It’ll return more variables … + (("raw", "var", "symbol"), "var", ["Symb1", "Symb2", "Symb3", "Symb4", "Symb5"]), +] + + +@pytest.mark.parametrize("args,dim,expected", paths, ids=repr) +def test_dim(args, dim, expected): + assert RefPath(*args).dim == dim + + +@pytest.mark.parametrize("args,dim,expected", paths, ids=repr) +def test_get_vector(args, dim, expected, adata): + rp = RefPath(*args) + vec = rp.get_vector(adata) + assert isinstance(vec, np.ndarray) + assert len(vec.shape) == 1 + assert vec.tolist() == expected + + +@pytest.mark.parametrize( + "rp_code", ["RefPath('obs', 'group')", "RefPath('varp', 'neighbors', 'Cell5', 1)"] +) +def test_repr(rp_code): + rp = eval(rp_code) + assert repr(rp) == rp_code + + +@pytest.mark.parametrize( + "spec, resolved", + [ + (("X", "var", "GeneX"), ("layers", "X", "var", "GeneX")), + ("op/foo/bar/0", ("obsp", "foo", "bar", 0)), + (("rX", "var", "GeneA"), ("raw", "X", "var", "GeneA")), + ], + ids=repr, +) +def test_parse(spec, resolved): + assert RefPath.parse(spec) == RefPath(*resolved) + + +@pytest.mark.parametrize( + "spec,err_regex", + [ + (("raw",), r"No path specified\."), + (("obsm", "X_pca", None), r"None in path \('X_pca', None\).*\{int, str\}\."), + (("layers", "X", 1, "G1"), r"path\[1\]=1.*not one of \{'obs', 'var'\}\."), + ("X", r"required \(3\) ≠ got \(1\) in path \('X',\) for attr layers\."), + ("f/XY", r"Unknown attr name or short code 'f'\."), + ("op/foo/bar/notAnInt", r"Invalid.*obsp path: 'notAnInt'\."), + ], + ids=repr, +) +def test_parse_failures(spec, 
err_regex): + with pytest.raises(ValueError, match=err_regex): + RefPath.parse(spec) + + +# anndata part (resolving paths) + + +def test_alias(): + pass # TODO + + +@pytest.mark.parametrize( + "multipath,reference", + [ + ([("obs", "A"), ("X", "var", "G1")], [("obs", "A"), ("X", "var", "G1")]), + (("obs", ["A", "B"]), [("obs", "A"), ("obs", "B")]), + ( + (["obs", "var"], ["A", "B"]), + [("obs", "A"), ("obs", "B"), ("var", "A"), ("var", "B")], + ), + ( + ["G1", ("rX", "var", ["G4", "G5"])], + ["G1", ("rX", "var", "G4"), ("rX", "var", "G5")], + ), + ], + ids=repr, +) +def test_split_paths(multipath, reference): + got = list(split_paths(multipath)) + assert got == reference + + +@pytest.mark.parametrize( + "short_path,resolved", + [ + # single strings should be found in {obs,var}{.columns,_names} + # but not in {obs,var}m DataFrame columns (too deep and obscure) + ("Cell1", ("layers", "X", "obs", "Cell1")), + ("group", ("obs", "group")), + # keys with subpaths should be found in layers, {obs,var}{p,m} + ("X_pca/1", ("obsm", "X_pca", 1)), + ("unspliced/GeneY", ("layers", "unspliced", "var", "GeneY")), + # {obs,var}p should default to axis 0 + (("neighbors_distances", "Cell2"), ("obsp", "neighbors_distances", "Cell2", 0)), + ], + ids=repr, +) +def test_resolve(adata, short_path, resolved): + if isinstance(short_path, str): + short_path = (short_path,) + assert resolve_path(adata, *short_path) == RefPath(*resolved) diff --git a/anndata/utils.py b/anndata/utils.py index 9e74c266b..6f055e2f2 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -1,5 +1,6 @@ import warnings from functools import wraps, singledispatch +from textwrap import dedent from typing import Mapping, Any, Sequence import pandas as pd @@ -170,3 +171,16 @@ def is_deprecated(attr): for item in type.__dir__(cls) if not is_deprecated(getattr(cls, item, None)) ] + + +def _doc_params(**kwds): + """\ + Docstrings should start with "\" in the first line for proper formatting. 
+ """ + + def dec(obj): + obj.__orig_doc__ = obj.__doc__ + obj.__doc__ = dedent(obj.__doc__).format_map(kwds) + return obj + + return dec diff --git a/docs/conf.py b/docs/conf.py index 7c359131c..7019c6566 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -105,8 +105,10 @@ def setup(app: Sphinx): ) qualname_overrides = { "anndata._core.anndata.AnnData": "anndata.AnnData", + "anndata._core.ref_path.RefPath": "anndata.RefPath", + "anndata._core.raw.Raw": "anndata.Raw", # Temporarily - "anndata._core.raw.Raw": "anndata.AnnData", + "anndata._core.sparse_dataset.SparseDataset": "scipy.sparse.spmatrix", "anndata._core.views.ArrayView": "numpy.ndarray", **{ f"anndata._core.aligned_mapping.{cls}{kind}": "typing.Mapping" diff --git a/docs/index.rst b/docs/index.rst index 87228fc47..8cf48edbf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,4 +20,5 @@ of data and learned annotations. It was initially built for api fileformat-prose benchmarks + internal-api references diff --git a/docs/internal-api.rst b/docs/internal-api.rst new file mode 100644 index 000000000..2da420d1e --- /dev/null +++ b/docs/internal-api.rst @@ -0,0 +1,12 @@ +Internal API +------------ + +Do not rely on anything in here, it can and will change without notice. + +.. currentmodule:: anndata + +.. autosummary:: + :toctree: . + + Raw + RefPath