From b424f59611fb442a472429f0f749794be5dfff34 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 13 Jul 2022 17:26:16 -0400 Subject: [PATCH] caching --- apis/python/src/tiledbsc/soma.py | 28 ------------ apis/python/src/tiledbsc/tiledb_group.py | 54 ++++++++++++------------ 2 files changed, 27 insertions(+), 55 deletions(-) diff --git a/apis/python/src/tiledbsc/soma.py b/apis/python/src/tiledbsc/soma.py index 8f4c8d491c..6b921e0511 100644 --- a/apis/python/src/tiledbsc/soma.py +++ b/apis/python/src/tiledbsc/soma.py @@ -3,7 +3,6 @@ import os from collections import Counter from typing import Optional, Sequence -import time import pandas as pd import tiledb @@ -60,9 +59,6 @@ def __init__( :param uri: URI of the TileDB group """ - t1 = time.time() - - print(time.time(), "AAA001") # People can (and should) call by name. However, it's easy to forget. For example, # if someone does 'tiledbsc.SOMA("myuri", ctx)' instead of 'tiledbsc.SOMA("myury", ctx)', # behavior will not be what they expect, and we should let them know sooner than later. @@ -76,7 +72,6 @@ def __init__( assert isinstance(ctx, tiledb.Ctx) if parent is not None: assert isinstance(parent, TileDBGroup) - print(time.time(), "AAA002") if ctx is None and config is not None: ctx = tiledb.Ctx(config) @@ -86,7 +81,6 @@ def __init__( name = os.path.basename(uri.rstrip("/")) if name == "": name = "soma" - print(time.time(), "AAA003") super().__init__( uri=uri, name=name, @@ -95,8 +89,6 @@ def __init__( ctx=ctx, ) - # t01 = time.time() - # print(time.time(), "AAA004") # obs_uri = self._get_child_uri("obs") # See comments in that function # var_uri = self._get_child_uri("var") # X_uri = self._get_child_uri("X") @@ -106,11 +98,6 @@ def __init__( # varp_uri = self._get_child_uri("varp") # raw_uri = self._get_child_uri("raw") # uns_uri = self._get_child_uri("uns") - # t02 = time.time() - # print("BBB001 %.3f" % (t02-t01)) - - t01 = time.time() - print(time.time(), "AAA104") member_names = ["obs", "var", "X", "obsm", "varm", "obsp", "varp", "raw", "uns"] child_uris = self._get_child_uris(member_names) # See comments in that function @@ -125,14 +112,8 @@ def __init__( raw_uri = child_uris["raw"] uns_uri = child_uris["uns"] - t02 = time.time() - print("BBB001 %.3f" % (t02-t01)) - - print(time.time(), "AAA005") self.obs = AnnotationDataFrame(uri=obs_uri, name="obs", parent=self) - print(time.time(), "AAA006") self.var = AnnotationDataFrame(uri=var_uri, name="var", parent=self) - print(time.time(), "AAA007") self.X = AssayMatrixGroup( uri=X_uri, name="X", @@ -142,11 +123,8 @@ def __init__( col_dataframe=self.var, parent=self, ) - print(time.time(), "AAA008") self.obsm = AnnotationMatrixGroup(uri=obsm_uri, name="obsm", parent=self) - print(time.time(), "AAA009") self.varm = AnnotationMatrixGroup(uri=varm_uri, name="varm", parent=self) - print(time.time(), "AAA010") self.obsp = AnnotationPairwiseMatrixGroup( uri=obsp_uri, name="obsp", @@ -154,7 +132,6 @@ def __init__( col_dataframe=self.obs, parent=self, ) - print(time.time(), "AAA011") self.varp = AnnotationPairwiseMatrixGroup( uri=varp_uri, name="varp", @@ -162,9 +139,7 @@ def __init__( col_dataframe=self.var, parent=self, ) - print(time.time(), "AAA012") self.raw = RawGroup(uri=raw_uri, name="raw", obs=self.obs, parent=self) - print(time.time(), "AAA013") self.uns = UnsGroup(uri=uns_uri, name="uns", parent=self) # If URI is "/something/test1" then: @@ -177,9 +152,6 @@ def __init__( # * var_uri is "tiledb://namespace/s3://bucketname/something/test1/var" # * data_uri is "tiledb://namespace/s3://bucketname/something/test1/X" - t2 = time.time() - print("SOMA CTOR SECONDS %.3f" % (t2-t1)) - # ---------------------------------------------------------------- def __repr__(self) -> str: """ diff --git a/apis/python/src/tiledbsc/tiledb_group.py b/apis/python/src/tiledbsc/tiledb_group.py index 5028dfaea9..39bc4f75e4 100644 --- a/apis/python/src/tiledbsc/tiledb_group.py +++ b/apis/python/src/tiledbsc/tiledb_group.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Dict, Optional, Sequence +from typing import Dict, Optional, Sequence, List import time import tiledb @@ -16,6 +16,8 @@ class TileDBGroup(TileDBObject): Wraps groups from TileDB-Py by retaining a URI, options, etc. """ + _cached_member_names_to_uris: Dict[str, str] + def __init__( self, uri: str, @@ -31,6 +33,7 @@ def __init__( See the TileDBObject constructor. """ super().__init__(uri, name, parent=parent, soma_options=soma_options, ctx=ctx) + self._cached_member_names_to_uris = None def exists(self) -> bool: """ @@ -97,7 +100,10 @@ def _get_child_uris(self, member_names: List[str]) -> Dict[str, str]: """ if not self.exists(): # TODO: comment - return {member_name : self.uri + "/" + member_name for member_name in member_names} + return { + member_name: self.uri + "/" + member_name + for member_name in member_names + } answer = {} @@ -125,30 +131,20 @@ def _get_child_uri(self, member_name: str) -> str: information. (This is because in TileDB Cloud, members have URIs like tiledb://namespace/df584345-28b7-45e5-abeb-043d409b1a97.) """ - t1 = time.time() - print("--XXXENTER") if not self.exists(): # TODO: comment - print("--XXXEXIT1 %.3f" % (time.time() - t1)) return self.uri + "/" + member_name - with self._open() as G: - if member_name in G: - return G[member_name].uri - else: - return self.uri + "/" + member_name -# mapping = self._get_member_names_to_uris() -# if member_name in mapping: -# print("--XXXEXIT2 %.3f" % (time.time() - t1)) -# return mapping[member_name] -# else: -# # Truly a slash, not os.path.join: -# # * If the client is Linux/Un*x/Mac, it's the same of course -# # * On Windows, os.path.sep is a backslash but backslashes are _not_ accepted for S3 or -# # tiledb-cloud URIs, whereas in Windows versions for years now forward slashes _are_ -# # accepted for local-disk paths. -# # This means forward slash is acceptable in all cases. -# print("--XXXEXIT3 %.3f" % (time.time() - t1)) -# return self.uri + "/" + member_name + mapping = self._get_member_names_to_uris() + if member_name in mapping: + return mapping[member_name] + else: + # Truly a slash, not os.path.join: + # * If the client is Linux/Un*x/Mac, it's the same of course + # * On Windows, os.path.sep is a backslash but backslashes are _not_ accepted for S3 or + # tiledb-cloud URIs, whereas in Windows versions for years now forward slashes _are_ + # accepted for local-disk paths. + # This means forward slash is acceptable in all cases. + return self.uri + "/" + member_name def _add_object(self, obj: TileDBObject, relative: Optional[bool] = None) -> None: """ @@ -183,9 +179,10 @@ def _add_object(self, obj: TileDBObject, relative: Optional[bool] = None) -> Non relative = not child_uri.startswith("tiledb://") if relative: child_uri = obj.name + self._cached_member_names_to_uris = None # invalidate with self._open("w") as G: retval = G.add(uri=child_uri, relative=relative, name=obj.name) - print("RETVAL ", retval) + #####print("RETVAL ", retval) # See _get_child_uri. Key point is that, on TileDB Cloud, URIs change from pre-creation to # post-creation. Example: # * Upload to pre-creation URI tiledb://namespace/s3://bucket/something/something/somaname @@ -194,12 +191,13 @@ def _add_object(self, obj: TileDBObject, relative: Optional[bool] = None) -> Non # * Member pre-creation URI tiledb://namespace/s3://bucket/something/something/somaname/obs # * Member post-creation URI tiledb://somaname/e4de581a-1353-4150-b1f4-6ed12548e497 obj.uri = self._get_child_uri(obj.name) - print("REMAP", child_uri, "TO", obj.uri) + ####print("REMAP", child_uri, "TO", obj.uri) def _remove_object(self, obj: TileDBObject) -> None: self._remove_object_by_name(obj.name) def _remove_object_by_name(self, member_name: str) -> None: + self._cached_member_names_to_uris = None # invalidate if self.uri.startswith("tiledb://"): mapping = self._get_member_names_to_uris() if member_name not in mapping: @@ -230,8 +228,10 @@ def _get_member_names_to_uris(self) -> Dict[str, str]: Like `_get_member_names()` and `_get_member_uris`, but returns a dict mapping from member name to member URI. """ - with self._open("r") as G: - return {obj.name: obj.uri for obj in G} + if self._cached_member_names_to_uris is None: + with self._open("r") as G: + self._cached_member_names_to_uris = {obj.name: obj.uri for obj in G} + return self._cached_member_names_to_uris def show_metadata(self, recursively: bool = True, indent: str = "") -> None: """