From 8d8c94d5245f19d9f679d4be8fe26241a8473410 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 15 Jul 2022 16:38:01 -0400 Subject: [PATCH] Optimize constructor time for tiledb-cloud case (#222) * debug * caching * more caching * more * lint * no exists-caching * neaten, and comment --- apis/python/src/tiledbsc/raw_group.py | 16 +++---- apis/python/src/tiledbsc/soma.py | 38 ++++++++------- apis/python/src/tiledbsc/soma_collection.py | 19 +++++++- apis/python/src/tiledbsc/tiledb_group.py | 52 ++++++++++++++++++++- apis/python/src/tiledbsc/uns_group.py | 5 +- 5 files changed, 99 insertions(+), 31 deletions(-) diff --git a/apis/python/src/tiledbsc/raw_group.py b/apis/python/src/tiledbsc/raw_group.py index 33852c8260..0e40529fa9 100644 --- a/apis/python/src/tiledbsc/raw_group.py +++ b/apis/python/src/tiledbsc/raw_group.py @@ -35,14 +35,12 @@ def __init__( super().__init__(uri=uri, name=name, parent=parent) self.parent_obs = obs - X_uri = self._get_child_uri("X") # See comments in that function - var_uri = self._get_child_uri("var") - varm_uri = self._get_child_uri("varm") - varp_uri = self._get_child_uri("varp") + # See comments in _get_child_uris + child_uris = self._get_child_uris(["X", "var", "varm", "varp"]) - self.var = AnnotationDataFrame(uri=var_uri, name="var", parent=self) + self.var = AnnotationDataFrame(uri=child_uris["var"], name="var", parent=self) self.X = AssayMatrixGroup( - uri=X_uri, + uri=child_uris["X"], name="X", row_dim_name="obs_id", col_dim_name="var_id", @@ -50,9 +48,11 @@ def __init__( col_dataframe=self.var, parent=self, ) - self.varm = AnnotationMatrixGroup(uri=varm_uri, name="varm", parent=self) + self.varm = AnnotationMatrixGroup( + uri=child_uris["varm"], name="varm", parent=self + ) self.varp = AnnotationPairwiseMatrixGroup( - uri=varp_uri, + uri=child_uris["varp"], name="varp", row_dataframe=self.var, col_dataframe=self.var, diff --git a/apis/python/src/tiledbsc/soma.py b/apis/python/src/tiledbsc/soma.py index 009149e336..3d8fe19f9b 100644 --- a/apis/python/src/tiledbsc/soma.py +++ b/apis/python/src/tiledbsc/soma.py @@ -89,15 +89,20 @@ def __init__( ctx=ctx, ) - obs_uri = self._get_child_uri("obs") # See comments in that function - var_uri = self._get_child_uri("var") - X_uri = self._get_child_uri("X") - obsm_uri = self._get_child_uri("obsm") - varm_uri = self._get_child_uri("varm") - obsp_uri = self._get_child_uri("obsp") - varp_uri = self._get_child_uri("varp") - raw_uri = self._get_child_uri("raw") - uns_uri = self._get_child_uri("uns") + # See comments in _get_child_uris + child_uris = self._get_child_uris( + ["obs", "var", "X", "obsm", "varm", "obsp", "varp", "raw", "uns"] + ) + + obs_uri = child_uris["obs"] + var_uri = child_uris["var"] + X_uri = child_uris["X"] + obsm_uri = child_uris["obsm"] + varm_uri = child_uris["varm"] + obsp_uri = child_uris["obsp"] + varp_uri = child_uris["varp"] + raw_uri = child_uris["raw"] + uns_uri = child_uris["uns"] self.obs = AnnotationDataFrame(uri=obs_uri, name="obs", parent=self) self.var = AnnotationDataFrame(uri=var_uri, name="var", parent=self) @@ -129,15 +134,12 @@ def __init__( self.raw = RawGroup(uri=raw_uri, name="raw", obs=self.obs, parent=self) self.uns = UnsGroup(uri=uns_uri, name="uns", parent=self) - # If URI is "/something/test1" then: - # * obs_uri is "/something/test1/obs" - # * var_uri is "/something/test1/var" - # * data_uri is "/something/test1/X" - - # If URI is "tiledb://namespace/s3://bucketname/something/test1" then: - # * obs_uri is "tiledb://namespace/s3://bucketname/something/test1/obs" - # * var_uri is "tiledb://namespace/s3://bucketname/something/test1/var" - # * data_uri is "tiledb://namespace/s3://bucketname/something/test1/X" + # Sample SOMA/member URIs: + # + # SOMA member + # "/foo/test1" "/foo/test1/obs" + # "tiledb://ns/s3://bkt/foo/test1" "tiledb://ns/s3://bkt/foo/test1/obs" + # "tiledb://ns/test1" "tiledb://ns/95cb11b5-2052-461b-9b99-cfa94e51e23f" # ---------------------------------------------------------------- def __repr__(self) -> str: diff --git a/apis/python/src/tiledbsc/soma_collection.py b/apis/python/src/tiledbsc/soma_collection.py index 77a22ddf17..3a8a5cd976 100644 --- a/apis/python/src/tiledbsc/soma_collection.py +++ b/apis/python/src/tiledbsc/soma_collection.py @@ -1,4 +1,4 @@ -from typing import Iterator, Optional, Sequence, Set, Union +from typing import Dict, Iterator, Optional, Sequence, Set, Union import tiledb @@ -14,6 +14,12 @@ class SOMACollection(TileDBGroup): Implements a collection of `SOMA` objects. """ + # This is a cache to avoid the overhead of calling the SOMA constructor repeatedly. That + # constructor isn't particuarly expensive, except that for tiledb-cloud URIs it needs to + # interrogate the server repeatedly for recursive group-member URIs, which has web-request + # latency. + _somas: Dict[str, SOMA] + # ---------------------------------------------------------------- def __init__( self, @@ -58,6 +64,8 @@ def __init__( ctx=ctx, ) + self._somas = {} + # ---------------------------------------------------------------- def __repr__(self) -> str: """ @@ -121,7 +129,10 @@ def __iter__(self) -> Iterator[SOMA]: Implements `for soma in soco: ...` """ for name, uri in self._get_member_names_to_uris().items(): - yield SOMA(uri=uri, name=name, parent=self, ctx=self._ctx) + if name not in self._somas: + # SOMA-constructor cache + self._somas[name] = SOMA(uri=uri, name=name, parent=self, ctx=self._ctx) + yield self._somas[name] # ---------------------------------------------------------------- def __contains__(self, name: str) -> bool: @@ -152,6 +163,10 @@ def __getitem__(self, name: str) -> Optional[SOMA]: Returns a `SOMA` element at the given name within the group, or `None` if no such member exists. Overloads the `[...]` operator. """ + if name in self._somas: + # SOMA-constructor cache + return self._somas[name] + with self._open("r") as G: try: obj = G[name] # This returns a tiledb.object.Object. diff --git a/apis/python/src/tiledbsc/tiledb_group.py b/apis/python/src/tiledbsc/tiledb_group.py index a486fb975f..fdcc4321ea 100644 --- a/apis/python/src/tiledbsc/tiledb_group.py +++ b/apis/python/src/tiledbsc/tiledb_group.py @@ -15,6 +15,10 @@ class TileDBGroup(TileDBObject): Wraps groups from TileDB-Py by retaining a URI, options, etc. """ + # Cache to avoid repeated calls to the REST server for resolving group-member URIs + # in the tiledb-cloud case. We invalidate this on add-member or remove-member. + _cached_member_names_to_uris: Optional[Dict[str, str]] + def __init__( self, uri: str, @@ -30,6 +34,7 @@ def __init__( See the TileDBObject constructor. """ super().__init__(uri, name, parent=parent, soma_options=soma_options, ctx=ctx) + self._cached_member_names_to_uris = None def exists(self) -> bool: """ @@ -37,6 +42,9 @@ def exists(self) -> bool: object has not yet been populated, e.g. before calling `from_anndata` -- or, if the SOMA has been populated but doesn't have this member (e.g. not all SOMAs have a `varp`). """ + # For tiledb:// URIs this is a REST-server request which we'd like to cache. + # However, remove-and-replace use-cases are possible and common in notebooks + # and it turns out caching the existence-check isn't a robust approach. return bool(tiledb.object_type(self.uri, ctx=self._ctx) == "group") def create_unless_exists(self) -> None: @@ -85,6 +93,39 @@ def _open(self, mode: str = "r") -> tiledb.Group: # This works in with-open-as contexts because tiledb.Group has __enter__ and __exit__ methods. return tiledb.Group(self.uri, mode=mode, ctx=self._ctx) + def _get_child_uris(self, member_names: Sequence[str]) -> Dict[str, str]: + """ + Batched version of `get_child_uri`. Since there's REST-server latency for getting + name-to-URI mapping for group-member URIs, in the tiledb://... case, total latency + is reduced when we ask for all group-element name-to-URI mappings in a single + request to the REST server. + """ + if not self.exists(): + # Group not constructed yet. Here, appending "/" and name is appropriate in all + # cases: even for tiledb://... URIs, pre-construction URIs are of the form + # tiledb://namespace/s3://something/something/soma/membername. + return { + member_name: self.uri + "/" + member_name + for member_name in member_names + } + + answer = {} + + mapping = self._get_member_names_to_uris() + for member_name in member_names: + if member_name in mapping: + answer[member_name] = mapping[member_name] + else: + # Truly a slash, not os.path.join: + # * If the client is Linux/Un*x/Mac, it's the same of course + # * On Windows, os.path.sep is a backslash but backslashes are _not_ accepted for S3 or + # tiledb-cloud URIs, whereas in Windows versions for years now forward slashes _are_ + # accepted for local-disk paths. + # This means forward slash is acceptable in all cases. + answer[member_name] = self.uri + "/" + member_name + + return answer + def _get_child_uri(self, member_name: str) -> str: """ Computes the URI for a child of the given object. For local disk, S3, and @@ -92,6 +133,9 @@ def _get_child_uri(self, member_name: str) -> str: member name. For post-creation TileDB-Cloud URIs, this is computed from the parent's information. (This is because in TileDB Cloud, members have URIs like tiledb://namespace/df584345-28b7-45e5-abeb-043d409b1a97.) + + Please use _get_child_uris whenever possible, to reduce the number of REST-server requests + in the tiledb//... URIs. """ if not self.exists(): # TODO: comment @@ -141,6 +185,7 @@ def _add_object(self, obj: TileDBObject, relative: Optional[bool] = None) -> Non relative = not child_uri.startswith("tiledb://") if relative: child_uri = obj.name + self._cached_member_names_to_uris = None # invalidate on add-member with self._open("w") as G: G.add(uri=child_uri, relative=relative, name=obj.name) # See _get_child_uri. Key point is that, on TileDB Cloud, URIs change from pre-creation to @@ -156,6 +201,7 @@ def _remove_object(self, obj: TileDBObject) -> None: self._remove_object_by_name(obj.name) def _remove_object_by_name(self, member_name: str) -> None: + self._cached_member_names_to_uris = None # invalidate on remove-member if self.uri.startswith("tiledb://"): mapping = self._get_member_names_to_uris() if member_name not in mapping: @@ -186,8 +232,10 @@ def _get_member_names_to_uris(self) -> Dict[str, str]: Like `_get_member_names()` and `_get_member_uris`, but returns a dict mapping from member name to member URI. """ - with self._open("r") as G: - return {obj.name: obj.uri for obj in G} + if self._cached_member_names_to_uris is None: + with self._open("r") as G: + self._cached_member_names_to_uris = {obj.name: obj.uri for obj in G} + return self._cached_member_names_to_uris def show_metadata(self, recursively: bool = True, indent: str = "") -> None: """ diff --git a/apis/python/src/tiledbsc/uns_group.py b/apis/python/src/tiledbsc/uns_group.py index c2772e7dc6..6a36208df3 100644 --- a/apis/python/src/tiledbsc/uns_group.py +++ b/apis/python/src/tiledbsc/uns_group.py @@ -143,8 +143,11 @@ def from_anndata_uns(self, uns: Mapping[str, Any]) -> None: # Must be done first, to create the parent directory self.create_unless_exists() + # See comments in _get_child_uris + child_uris = self._get_child_uris(list(uns.keys())) + for key in uns.keys(): - component_uri = self._get_child_uri(key) # See comments in that function + component_uri = child_uris[key] value = uns[key] if key == "rank_genes_groups":