single-cell-data · johnkerl · Jun 1, 2022 · Jun 1, 2022 · aaronwolen · Jun 1, 2022
diff --git a/apis/python/src/tiledbsc/annotation_dataframe.py b/apis/python/src/tiledbsc/annotation_dataframe.py
@@ -85,6 +85,15 @@ def dim_select(self, ids):
         #   >>> A.meta.items()
         #   (('__pandas_index_dims', '{"obs_id": "<U0"}'),)
         # so the set_index is already done for us.
+
+        # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
+        # This is the 'decode on read' part of our logic; in from_dataframe we have the 'encode on write' part.
+        # Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99.
+        for k in df:
+            dfk = df[k]
+            if len(dfk) > 0 and type(dfk[0]) == bytes:
+                df[k] = dfk.map(lambda e: e.decode())
+
         return df
 
     # ----------------------------------------------------------------
@@ -163,15 +172,38 @@ def from_dataframe(self, dataframe: pd.DataFrame, extent: int) -> None:
             if self._verbose:
                 print(f"{self._indent}Re-using existing array {self.uri}")
 
-        # Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99.
+        # ISSUE:
+        # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB
+        # QueryCondition API. While this needs to be addressed -- global collaborators will want to
+        # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible
+        # to query, we need to store these as ASCII.
+        #
+        # This is (besides collation) a storage-level issue, not a presentation-level issue. At write
+        # time, "α,β,γ" is stored as the UTF-8 bytes "\xce\xb1,\xce\xb2,\xce\xb3"; at read time,
+        # since SOMA is an API, we UTF-8-decode those bytes when a query is done and give the user
+        # back "α,β,γ".
+        #
+        # CONTEXT:
+        # https://github.com/single-cell-data/TileDB-SingleCell/issues/99
+        # https://github.com/single-cell-data/TileDB-SingleCell/pull/101
+        # https://github.com/single-cell-data/TileDB-SingleCell/issues/106
+        # https://github.com/single-cell-data/TileDB-SingleCell/pull/117
+        #
+        # IMPLEMENTATION:
+        # Python types -- float, string, what have you -- appear as dtype('O') which is not useful.
+        # Also, `tiledb.from_pandas` has `column_types` but that _forces_ columns to a particular
+        # type even if they shouldn't be.
+        #
+        # Instead, we peek at each column's first value to get a little jump on what `tiledb.from_pandas`
+        # is going to be doing anyway, namely, type-inferring to see what is going to be a string.
+        #
         # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
-        column_types = {}  # XXX None OR {} ?
-        if self.name in self._soma_options.col_names_to_store_as_ascii:
-            col_names_to_store_as_ascii = (
-                self._soma_options.col_names_to_store_as_ascii[self.name]
-            )
-            for col_name in col_names_to_store_as_ascii:
-                column_types[col_name] = np.dtype("S")
+        column_types = {}
+        for column_name in dataframe.keys():
+            dfc = dataframe[column_name]
+            if len(dfc) > 0 and type(dfc[0]) == str:
+                # Force ASCII storage if string, in order to make obs/var columns queryable.
+                column_types[column_name] = np.dtype("S")
 
         tiledb.from_pandas(
             uri=self.uri,

diff --git a/apis/python/src/tiledbsc/soma_options.py b/apis/python/src/tiledbsc/soma_options.py
@@ -1,33 +1,5 @@
 from typing import List, Dict
 
-# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this.
-# Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99.
-default_col_names_to_store_as_ascii = {
-    "obs": [
-        "assay_ontology_term_id",
-        "sex_ontology_term_id",
-        "organism_ontology_term_id",
-        "disease_ontology_term_id",
-        "ethnicity_ontology_term_id",
-        "development_stage_ontology_term_id",
-        "cell_type_ontology_term_id",
-        "tissue_ontology_term_id",
-        "cell_type",
-        "assay",
-        "disease",
-        "organism",
-        "sex",
-        "tissue",
-        "ethnicity",
-        "development_stage",
-    ],
-    "var": [
-        "feature_biotype",
-        "feature_name",
-        "feature_reference",
-    ],
-}
-
 
 class SOMAOptions:
     """
@@ -45,7 +17,6 @@ class SOMAOptions:
     string_dim_zstd_level: int
     write_X_chunked: bool
     goal_chunk_nnz: int
-    col_names_to_store_as_ascii: Dict[str, List[str]]
     member_uris_are_relative: bool
 
     def __init__(
@@ -58,7 +29,6 @@ def __init__(
         string_dim_zstd_level=22,  # https://github.com/single-cell-data/TileDB-SingleCell/issues/27
         write_X_chunked=True,
         goal_chunk_nnz=10000000,
-        col_names_to_store_as_ascii=default_col_names_to_store_as_ascii,
         member_uris_are_relative=None,  # Allows relocatability for local disk / S3, and correct behavior for TileDB Cloud
     ):
         self.obs_extent = obs_extent
@@ -69,5 +39,4 @@ def __init__(
         self.string_dim_zstd_level = string_dim_zstd_level
         self.write_X_chunked = write_X_chunked
         self.goal_chunk_nnz = goal_chunk_nnz
-        self.col_names_to_store_as_ascii = col_names_to_store_as_ascii
         self.member_uris_are_relative = member_uris_are_relative
diff --git a/apis/python/tests/test_type_diversity.py b/apis/python/tests/test_type_diversity.py
@@ -70,11 +70,6 @@ def test_from_anndata_X_type(tmp_path, X_dtype_name, X_encoding):
         assert False  # sanity - test misconfiguration
 
     adata = ad.AnnData(X=X, obs=obs, var=var, dtype=X.dtype)
-    print(
-        " =============================================================>==",
-        adata.X.dtype,
-        X_dtype,
-    )
     assert adata.X.dtype == X_dtype  # sanity
 
     io.from_anndata(SOMA(tmp_path.as_posix()), adata)
@@ -180,6 +175,10 @@ def cmp_dtype(series, tdb: tiledb.Attr) -> bool:
         # TileDB has no object, so assume it will convert to the type underlying the object
         if ad_dtype == np.dtype("O"):
             ad_dtype = np.dtype(type(series[0]))
+            # TODO: see annotation_dataframe.py. Once Unicode attributes are queryable, we'll need
+            # to remove this check which is verifying the current force-to-ASCII workaround.
+            if ad_dtype.name == "str":
+                ad_dtype = np.dtype("S")
         # TileDB has no bool, and automatically converts to uint8
         if ad_dtype == bool:
             ad_dtype = np.uint8

diff --git a/apis/python/tools/peek-ann b/apis/python/tools/peek-ann
@@ -30,6 +30,9 @@ else:
 
 ann = anndata.read_h5ad(input_path)
 
+def decat(ann):
+    return tiledbsc.util_ann._decategoricalize(ann)
+
 # Interact at the prompt now:
 # * ann.X
 # * ann.obs.keys()