Skip to content

Commit

Permalink
Optimize constructor time for tiledb-cloud case (#222)
Browse files Browse the repository at this point in the history
* debug

* caching

* more caching

* more

* lint

* no exists-caching

* neaten, and comment
  • Loading branch information
johnkerl authored Jul 15, 2022
1 parent 657ef7f commit 8d8c94d
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 31 deletions.
16 changes: 8 additions & 8 deletions apis/python/src/tiledbsc/raw_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,24 +35,24 @@ def __init__(
super().__init__(uri=uri, name=name, parent=parent)
self.parent_obs = obs

X_uri = self._get_child_uri("X") # See comments in that function
var_uri = self._get_child_uri("var")
varm_uri = self._get_child_uri("varm")
varp_uri = self._get_child_uri("varp")
# See comments in _get_child_uris
child_uris = self._get_child_uris(["X", "var", "varm", "varp"])

self.var = AnnotationDataFrame(uri=var_uri, name="var", parent=self)
self.var = AnnotationDataFrame(uri=child_uris["var"], name="var", parent=self)
self.X = AssayMatrixGroup(
uri=X_uri,
uri=child_uris["X"],
name="X",
row_dim_name="obs_id",
col_dim_name="var_id",
row_dataframe=self.parent_obs,
col_dataframe=self.var,
parent=self,
)
self.varm = AnnotationMatrixGroup(uri=varm_uri, name="varm", parent=self)
self.varm = AnnotationMatrixGroup(
uri=child_uris["varm"], name="varm", parent=self
)
self.varp = AnnotationPairwiseMatrixGroup(
uri=varp_uri,
uri=child_uris["varp"],
name="varp",
row_dataframe=self.var,
col_dataframe=self.var,
Expand Down
38 changes: 20 additions & 18 deletions apis/python/src/tiledbsc/soma.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,20 @@ def __init__(
ctx=ctx,
)

obs_uri = self._get_child_uri("obs") # See comments in that function
var_uri = self._get_child_uri("var")
X_uri = self._get_child_uri("X")
obsm_uri = self._get_child_uri("obsm")
varm_uri = self._get_child_uri("varm")
obsp_uri = self._get_child_uri("obsp")
varp_uri = self._get_child_uri("varp")
raw_uri = self._get_child_uri("raw")
uns_uri = self._get_child_uri("uns")
# See comments in _get_child_uris
child_uris = self._get_child_uris(
["obs", "var", "X", "obsm", "varm", "obsp", "varp", "raw", "uns"]
)

obs_uri = child_uris["obs"]
var_uri = child_uris["var"]
X_uri = child_uris["X"]
obsm_uri = child_uris["obsm"]
varm_uri = child_uris["varm"]
obsp_uri = child_uris["obsp"]
varp_uri = child_uris["varp"]
raw_uri = child_uris["raw"]
uns_uri = child_uris["uns"]

self.obs = AnnotationDataFrame(uri=obs_uri, name="obs", parent=self)
self.var = AnnotationDataFrame(uri=var_uri, name="var", parent=self)
Expand Down Expand Up @@ -129,15 +134,12 @@ def __init__(
self.raw = RawGroup(uri=raw_uri, name="raw", obs=self.obs, parent=self)
self.uns = UnsGroup(uri=uns_uri, name="uns", parent=self)

# If URI is "/something/test1" then:
# * obs_uri is "/something/test1/obs"
# * var_uri is "/something/test1/var"
# * data_uri is "/something/test1/X"

# If URI is "tiledb://namespace/s3://bucketname/something/test1" then:
# * obs_uri is "tiledb://namespace/s3://bucketname/something/test1/obs"
# * var_uri is "tiledb://namespace/s3://bucketname/something/test1/var"
# * data_uri is "tiledb://namespace/s3://bucketname/something/test1/X"
# Sample SOMA/member URIs:
#
# SOMA member
# "/foo/test1" "/foo/test1/obs"
# "tiledb://ns/s3://bkt/foo/test1" "tiledb://ns/s3://bkt/foo/test1/obs"
# "tiledb://ns/test1" "tiledb://ns/95cb11b5-2052-461b-9b99-cfa94e51e23f"

# ----------------------------------------------------------------
def __repr__(self) -> str:
Expand Down
19 changes: 17 additions & 2 deletions apis/python/src/tiledbsc/soma_collection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterator, Optional, Sequence, Set, Union
from typing import Dict, Iterator, Optional, Sequence, Set, Union

import tiledb

Expand All @@ -14,6 +14,12 @@ class SOMACollection(TileDBGroup):
Implements a collection of `SOMA` objects.
"""

# This is a cache to avoid the overhead of calling the SOMA constructor repeatedly. That
# constructor isn't particuarly expensive, except that for tiledb-cloud URIs it needs to
# interrogate the server repeatedly for recursive group-member URIs, which has web-request
# latency.
_somas: Dict[str, SOMA]

# ----------------------------------------------------------------
def __init__(
self,
Expand Down Expand Up @@ -58,6 +64,8 @@ def __init__(
ctx=ctx,
)

self._somas = {}

# ----------------------------------------------------------------
def __repr__(self) -> str:
"""
Expand Down Expand Up @@ -121,7 +129,10 @@ def __iter__(self) -> Iterator[SOMA]:
Implements `for soma in soco: ...`
"""
for name, uri in self._get_member_names_to_uris().items():
yield SOMA(uri=uri, name=name, parent=self, ctx=self._ctx)
if name not in self._somas:
# SOMA-constructor cache
self._somas[name] = SOMA(uri=uri, name=name, parent=self, ctx=self._ctx)
yield self._somas[name]

# ----------------------------------------------------------------
def __contains__(self, name: str) -> bool:
Expand Down Expand Up @@ -152,6 +163,10 @@ def __getitem__(self, name: str) -> Optional[SOMA]:
Returns a `SOMA` element at the given name within the group, or `None` if no such
member exists. Overloads the `[...]` operator.
"""
if name in self._somas:
# SOMA-constructor cache
return self._somas[name]

with self._open("r") as G:
try:
obj = G[name] # This returns a tiledb.object.Object.
Expand Down
52 changes: 50 additions & 2 deletions apis/python/src/tiledbsc/tiledb_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ class TileDBGroup(TileDBObject):
Wraps groups from TileDB-Py by retaining a URI, options, etc.
"""

# Cache to avoid repeated calls to the REST server for resolving group-member URIs
# in the tiledb-cloud case. We invalidate this on add-member or remove-member.
_cached_member_names_to_uris: Optional[Dict[str, str]]

def __init__(
self,
uri: str,
Expand All @@ -30,13 +34,17 @@ def __init__(
See the TileDBObject constructor.
"""
super().__init__(uri, name, parent=parent, soma_options=soma_options, ctx=ctx)
self._cached_member_names_to_uris = None

def exists(self) -> bool:
"""
Tells whether or not there is storage for the group. This might be in case a SOMA
object has not yet been populated, e.g. before calling `from_anndata` -- or, if the
SOMA has been populated but doesn't have this member (e.g. not all SOMAs have a `varp`).
"""
# For tiledb:// URIs this is a REST-server request which we'd like to cache.
# However, remove-and-replace use-cases are possible and common in notebooks
# and it turns out caching the existence-check isn't a robust approach.
return bool(tiledb.object_type(self.uri, ctx=self._ctx) == "group")

def create_unless_exists(self) -> None:
Expand Down Expand Up @@ -85,13 +93,49 @@ def _open(self, mode: str = "r") -> tiledb.Group:
# This works in with-open-as contexts because tiledb.Group has __enter__ and __exit__ methods.
return tiledb.Group(self.uri, mode=mode, ctx=self._ctx)

def _get_child_uris(self, member_names: Sequence[str]) -> Dict[str, str]:
"""
Batched version of `get_child_uri`. Since there's REST-server latency for getting
name-to-URI mapping for group-member URIs, in the tiledb://... case, total latency
is reduced when we ask for all group-element name-to-URI mappings in a single
request to the REST server.
"""
if not self.exists():
# Group not constructed yet. Here, appending "/" and name is appropriate in all
# cases: even for tiledb://... URIs, pre-construction URIs are of the form
# tiledb://namespace/s3://something/something/soma/membername.
return {
member_name: self.uri + "/" + member_name
for member_name in member_names
}

answer = {}

mapping = self._get_member_names_to_uris()
for member_name in member_names:
if member_name in mapping:
answer[member_name] = mapping[member_name]
else:
# Truly a slash, not os.path.join:
# * If the client is Linux/Un*x/Mac, it's the same of course
# * On Windows, os.path.sep is a backslash but backslashes are _not_ accepted for S3 or
# tiledb-cloud URIs, whereas in Windows versions for years now forward slashes _are_
# accepted for local-disk paths.
# This means forward slash is acceptable in all cases.
answer[member_name] = self.uri + "/" + member_name

return answer

def _get_child_uri(self, member_name: str) -> str:
"""
Computes the URI for a child of the given object. For local disk, S3, and
tiledb://.../s3://... pre-creation URIs, this is simply the parent's URI, a slash, and the
member name. For post-creation TileDB-Cloud URIs, this is computed from the parent's
information. (This is because in TileDB Cloud, members have URIs like
tiledb://namespace/df584345-28b7-45e5-abeb-043d409b1a97.)
Please use _get_child_uris whenever possible, to reduce the number of REST-server requests
in the tiledb//... URIs.
"""
if not self.exists():
# TODO: comment
Expand Down Expand Up @@ -141,6 +185,7 @@ def _add_object(self, obj: TileDBObject, relative: Optional[bool] = None) -> Non
relative = not child_uri.startswith("tiledb://")
if relative:
child_uri = obj.name
self._cached_member_names_to_uris = None # invalidate on add-member
with self._open("w") as G:
G.add(uri=child_uri, relative=relative, name=obj.name)
# See _get_child_uri. Key point is that, on TileDB Cloud, URIs change from pre-creation to
Expand All @@ -156,6 +201,7 @@ def _remove_object(self, obj: TileDBObject) -> None:
self._remove_object_by_name(obj.name)

def _remove_object_by_name(self, member_name: str) -> None:
self._cached_member_names_to_uris = None # invalidate on remove-member
if self.uri.startswith("tiledb://"):
mapping = self._get_member_names_to_uris()
if member_name not in mapping:
Expand Down Expand Up @@ -186,8 +232,10 @@ def _get_member_names_to_uris(self) -> Dict[str, str]:
Like `_get_member_names()` and `_get_member_uris`, but returns a dict mapping from
member name to member URI.
"""
with self._open("r") as G:
return {obj.name: obj.uri for obj in G}
if self._cached_member_names_to_uris is None:
with self._open("r") as G:
self._cached_member_names_to_uris = {obj.name: obj.uri for obj in G}
return self._cached_member_names_to_uris

def show_metadata(self, recursively: bool = True, indent: str = "") -> None:
"""
Expand Down
5 changes: 4 additions & 1 deletion apis/python/src/tiledbsc/uns_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,11 @@ def from_anndata_uns(self, uns: Mapping[str, Any]) -> None:
# Must be done first, to create the parent directory
self.create_unless_exists()

# See comments in _get_child_uris
child_uris = self._get_child_uris(list(uns.keys()))

for key in uns.keys():
component_uri = self._get_child_uri(key) # See comments in that function
component_uri = child_uris[key]
value = uns[key]

if key == "rank_genes_groups":
Expand Down

0 comments on commit 8d8c94d

Please sign in to comment.