Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize constructor time for tiledb-cloud case #222

Merged
merged 7 commits into from
Jul 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions apis/python/src/tiledbsc/raw_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,24 +35,24 @@ def __init__(
super().__init__(uri=uri, name=name, parent=parent)
self.parent_obs = obs

X_uri = self._get_child_uri("X") # See comments in that function
var_uri = self._get_child_uri("var")
varm_uri = self._get_child_uri("varm")
varp_uri = self._get_child_uri("varp")
# See comments in _get_child_uris
child_uris = self._get_child_uris(["X", "var", "varm", "varp"])

self.var = AnnotationDataFrame(uri=var_uri, name="var", parent=self)
self.var = AnnotationDataFrame(uri=child_uris["var"], name="var", parent=self)
self.X = AssayMatrixGroup(
uri=X_uri,
uri=child_uris["X"],
name="X",
row_dim_name="obs_id",
col_dim_name="var_id",
row_dataframe=self.parent_obs,
col_dataframe=self.var,
parent=self,
)
self.varm = AnnotationMatrixGroup(uri=varm_uri, name="varm", parent=self)
self.varm = AnnotationMatrixGroup(
uri=child_uris["varm"], name="varm", parent=self
)
self.varp = AnnotationPairwiseMatrixGroup(
uri=varp_uri,
uri=child_uris["varp"],
name="varp",
row_dataframe=self.var,
col_dataframe=self.var,
Expand Down
38 changes: 20 additions & 18 deletions apis/python/src/tiledbsc/soma.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,20 @@ def __init__(
ctx=ctx,
)

obs_uri = self._get_child_uri("obs") # See comments in that function
var_uri = self._get_child_uri("var")
X_uri = self._get_child_uri("X")
obsm_uri = self._get_child_uri("obsm")
varm_uri = self._get_child_uri("varm")
obsp_uri = self._get_child_uri("obsp")
varp_uri = self._get_child_uri("varp")
raw_uri = self._get_child_uri("raw")
uns_uri = self._get_child_uri("uns")
# See comments in _get_child_uris
child_uris = self._get_child_uris(
["obs", "var", "X", "obsm", "varm", "obsp", "varp", "raw", "uns"]
)

obs_uri = child_uris["obs"]
var_uri = child_uris["var"]
X_uri = child_uris["X"]
obsm_uri = child_uris["obsm"]
varm_uri = child_uris["varm"]
obsp_uri = child_uris["obsp"]
varp_uri = child_uris["varp"]
raw_uri = child_uris["raw"]
uns_uri = child_uris["uns"]

self.obs = AnnotationDataFrame(uri=obs_uri, name="obs", parent=self)
self.var = AnnotationDataFrame(uri=var_uri, name="var", parent=self)
Expand Down Expand Up @@ -129,15 +134,12 @@ def __init__(
self.raw = RawGroup(uri=raw_uri, name="raw", obs=self.obs, parent=self)
self.uns = UnsGroup(uri=uns_uri, name="uns", parent=self)

# If URI is "/something/test1" then:
# * obs_uri is "/something/test1/obs"
# * var_uri is "/something/test1/var"
# * data_uri is "/something/test1/X"

# If URI is "tiledb://namespace/s3://bucketname/something/test1" then:
# * obs_uri is "tiledb://namespace/s3://bucketname/something/test1/obs"
# * var_uri is "tiledb://namespace/s3://bucketname/something/test1/var"
# * data_uri is "tiledb://namespace/s3://bucketname/something/test1/X"
# Sample SOMA/member URIs:
#
# SOMA member
# "/foo/test1" "/foo/test1/obs"
# "tiledb://ns/s3://bkt/foo/test1" "tiledb://ns/s3://bkt/foo/test1/obs"
# "tiledb://ns/test1" "tiledb://ns/95cb11b5-2052-461b-9b99-cfa94e51e23f"

# ----------------------------------------------------------------
def __repr__(self) -> str:
Expand Down
19 changes: 17 additions & 2 deletions apis/python/src/tiledbsc/soma_collection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Iterator, Optional, Sequence, Set, Union
from typing import Dict, Iterator, Optional, Sequence, Set, Union

import tiledb

Expand All @@ -14,6 +14,12 @@ class SOMACollection(TileDBGroup):
Implements a collection of `SOMA` objects.
"""

# This is a cache to avoid the overhead of calling the SOMA constructor repeatedly. That
# constructor isn't particuarly expensive, except that for tiledb-cloud URIs it needs to
# interrogate the server repeatedly for recursive group-member URIs, which has web-request
# latency.
_somas: Dict[str, SOMA]

# ----------------------------------------------------------------
def __init__(
self,
Expand Down Expand Up @@ -58,6 +64,8 @@ def __init__(
ctx=ctx,
)

self._somas = {}

# ----------------------------------------------------------------
def __repr__(self) -> str:
"""
Expand Down Expand Up @@ -121,7 +129,10 @@ def __iter__(self) -> Iterator[SOMA]:
Implements `for soma in soco: ...`
"""
for name, uri in self._get_member_names_to_uris().items():
yield SOMA(uri=uri, name=name, parent=self, ctx=self._ctx)
if name not in self._somas:
# SOMA-constructor cache
self._somas[name] = SOMA(uri=uri, name=name, parent=self, ctx=self._ctx)
yield self._somas[name]

# ----------------------------------------------------------------
def __contains__(self, name: str) -> bool:
Expand Down Expand Up @@ -152,6 +163,10 @@ def __getitem__(self, name: str) -> Optional[SOMA]:
Returns a `SOMA` element at the given name within the group, or `None` if no such
member exists. Overloads the `[...]` operator.
"""
if name in self._somas:
# SOMA-constructor cache
return self._somas[name]

with self._open("r") as G:
try:
obj = G[name] # This returns a tiledb.object.Object.
Expand Down
52 changes: 50 additions & 2 deletions apis/python/src/tiledbsc/tiledb_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ class TileDBGroup(TileDBObject):
Wraps groups from TileDB-Py by retaining a URI, options, etc.
"""

# Cache to avoid repeated calls to the REST server for resolving group-member URIs
# in the tiledb-cloud case. We invalidate this on add-member or remove-member.
_cached_member_names_to_uris: Optional[Dict[str, str]]

def __init__(
self,
uri: str,
Expand All @@ -30,13 +34,17 @@ def __init__(
See the TileDBObject constructor.
"""
super().__init__(uri, name, parent=parent, soma_options=soma_options, ctx=ctx)
self._cached_member_names_to_uris = None

def exists(self) -> bool:
"""
Tells whether or not there is storage for the group. This might be in case a SOMA
object has not yet been populated, e.g. before calling `from_anndata` -- or, if the
SOMA has been populated but doesn't have this member (e.g. not all SOMAs have a `varp`).
"""
# For tiledb:// URIs this is a REST-server request which we'd like to cache.
# However, remove-and-replace use-cases are possible and common in notebooks
# and it turns out caching the existence-check isn't a robust approach.
return bool(tiledb.object_type(self.uri, ctx=self._ctx) == "group")

def create_unless_exists(self) -> None:
Expand Down Expand Up @@ -85,13 +93,49 @@ def _open(self, mode: str = "r") -> tiledb.Group:
# This works in with-open-as contexts because tiledb.Group has __enter__ and __exit__ methods.
return tiledb.Group(self.uri, mode=mode, ctx=self._ctx)

def _get_child_uris(self, member_names: Sequence[str]) -> Dict[str, str]:
"""
Batched version of `get_child_uri`. Since there's REST-server latency for getting
name-to-URI mapping for group-member URIs, in the tiledb://... case, total latency
is reduced when we ask for all group-element name-to-URI mappings in a single
request to the REST server.
"""
if not self.exists():
# Group not constructed yet. Here, appending "/" and name is appropriate in all
# cases: even for tiledb://... URIs, pre-construction URIs are of the form
# tiledb://namespace/s3://something/something/soma/membername.
return {
member_name: self.uri + "/" + member_name
for member_name in member_names
}

answer = {}

mapping = self._get_member_names_to_uris()
for member_name in member_names:
if member_name in mapping:
answer[member_name] = mapping[member_name]
else:
# Truly a slash, not os.path.join:
# * If the client is Linux/Un*x/Mac, it's the same of course
# * On Windows, os.path.sep is a backslash but backslashes are _not_ accepted for S3 or
# tiledb-cloud URIs, whereas in Windows versions for years now forward slashes _are_
# accepted for local-disk paths.
# This means forward slash is acceptable in all cases.
answer[member_name] = self.uri + "/" + member_name

return answer

def _get_child_uri(self, member_name: str) -> str:
"""
Computes the URI for a child of the given object. For local disk, S3, and
tiledb://.../s3://... pre-creation URIs, this is simply the parent's URI, a slash, and the
member name. For post-creation TileDB-Cloud URIs, this is computed from the parent's
information. (This is because in TileDB Cloud, members have URIs like
tiledb://namespace/df584345-28b7-45e5-abeb-043d409b1a97.)

Please use _get_child_uris whenever possible, to reduce the number of REST-server requests
in the tiledb//... URIs.
"""
if not self.exists():
# TODO: comment
Expand Down Expand Up @@ -141,6 +185,7 @@ def _add_object(self, obj: TileDBObject, relative: Optional[bool] = None) -> Non
relative = not child_uri.startswith("tiledb://")
if relative:
child_uri = obj.name
self._cached_member_names_to_uris = None # invalidate on add-member
with self._open("w") as G:
G.add(uri=child_uri, relative=relative, name=obj.name)
# See _get_child_uri. Key point is that, on TileDB Cloud, URIs change from pre-creation to
Expand All @@ -156,6 +201,7 @@ def _remove_object(self, obj: TileDBObject) -> None:
self._remove_object_by_name(obj.name)

def _remove_object_by_name(self, member_name: str) -> None:
self._cached_member_names_to_uris = None # invalidate on remove-member
if self.uri.startswith("tiledb://"):
mapping = self._get_member_names_to_uris()
if member_name not in mapping:
Expand Down Expand Up @@ -186,8 +232,10 @@ def _get_member_names_to_uris(self) -> Dict[str, str]:
Like `_get_member_names()` and `_get_member_uris`, but returns a dict mapping from
member name to member URI.
"""
with self._open("r") as G:
return {obj.name: obj.uri for obj in G}
if self._cached_member_names_to_uris is None:
with self._open("r") as G:
self._cached_member_names_to_uris = {obj.name: obj.uri for obj in G}
return self._cached_member_names_to_uris

def show_metadata(self, recursively: bool = True, indent: str = "") -> None:
"""
Expand Down
5 changes: 4 additions & 1 deletion apis/python/src/tiledbsc/uns_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,11 @@ def from_anndata_uns(self, uns: Mapping[str, Any]) -> None:
# Must be done first, to create the parent directory
self.create_unless_exists()

# See comments in _get_child_uris
child_uris = self._get_child_uris(list(uns.keys()))

for key in uns.keys():
component_uri = self._get_child_uri(key) # See comments in that function
component_uri = child_uris[key]
value = uns[key]

if key == "rank_genes_groups":
Expand Down