diff --git a/apis/python/src/tiledbsc/annotation_dataframe.py b/apis/python/src/tiledbsc/annotation_dataframe.py index a560610036..8bcd52a4d7 100644 --- a/apis/python/src/tiledbsc/annotation_dataframe.py +++ b/apis/python/src/tiledbsc/annotation_dataframe.py @@ -85,6 +85,15 @@ def dim_select(self, ids): # >>> A.meta.items() # (('__pandas_index_dims', '{"obs_id": " 0 and type(dfk[0]) == bytes: + df[k] = dfk.map(lambda e: e.decode()) + return df # ---------------------------------------------------------------- @@ -163,15 +172,38 @@ def from_dataframe(self, dataframe: pd.DataFrame, extent: int) -> None: if self._verbose: print(f"{self._indent}Re-using existing array {self.uri}") - # Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99. + # ISSUE: + # TileDB attributes can be stored as Unicode but they are not yet queryable via the TileDB + # QueryCondition API. While this needs to be addressed -- global collaborators will want to + # write annotation-dataframe values in Unicode -- until then, to make obs/var data possible + # to query, we need to store these as ASCII. + # + # This is (besides collation) a storage-level issue, not a presentation-level issue: at write + # time, this works — "α,β,γ" stores as "\xce\xb1,\xce\xb2,\xce\xb3"; at read time, since + # SOMA is an API, we utf8-decode those strings when a query is done & give the user back + # "α,β,γ". + # + # CONTEXT: + # https://github.com/single-cell-data/TileDB-SingleCell/issues/99 + # https://github.com/single-cell-data/TileDB-SingleCell/pull/101 + # https://github.com/single-cell-data/TileDB-SingleCell/issues/106 + # https://github.com/single-cell-data/TileDB-SingleCell/pull/117 + # + # IMPLEMENTATION: + # Python types -- float, string, what have you -- appear as dtype('O') which is not useful. + # Also, `tiledb.from_pandas` has `column_types` but that _forces_ things to a + # particular type even if they shouldn't be.
+ # + # Instead, we use `dataframe.convert_dtypes` to get a little jump on what `tiledb.from_pandas` + # is going to be doing anyway, namely, type-inferring to see what is going to be a string. + # # TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. - column_types = {} # XXX None OR {} ? - if self.name in self._soma_options.col_names_to_store_as_ascii: - col_names_to_store_as_ascii = ( - self._soma_options.col_names_to_store_as_ascii[self.name] - ) - for col_name in col_names_to_store_as_ascii: - column_types[col_name] = np.dtype("S") + column_types = {} + for column_name in dataframe.keys(): + dfc = dataframe[column_name] + if len(dfc) > 0 and type(dfc[0]) == str: + # Force ASCII storage if string, in order to make obs/var columns queryable. + column_types[column_name] = np.dtype("S") tiledb.from_pandas( uri=self.uri, diff --git a/apis/python/src/tiledbsc/soma_options.py b/apis/python/src/tiledbsc/soma_options.py index 45df35ae4a..077d108a57 100644 --- a/apis/python/src/tiledbsc/soma_options.py +++ b/apis/python/src/tiledbsc/soma_options.py @@ -1,33 +1,5 @@ from typing import List, Dict -# TODO: when UTF-8 attributes are queryable using TileDB-Py's QueryCondition API we can remove this. -# Context: https://github.com/single-cell-data/TileDB-SingleCell/issues/99. 
-default_col_names_to_store_as_ascii = { - "obs": [ - "assay_ontology_term_id", - "sex_ontology_term_id", - "organism_ontology_term_id", - "disease_ontology_term_id", - "ethnicity_ontology_term_id", - "development_stage_ontology_term_id", - "cell_type_ontology_term_id", - "tissue_ontology_term_id", - "cell_type", - "assay", - "disease", - "organism", - "sex", - "tissue", - "ethnicity", - "development_stage", - ], - "var": [ - "feature_biotype", - "feature_name", - "feature_reference", - ], -} - class SOMAOptions: """ @@ -45,7 +17,6 @@ class SOMAOptions: string_dim_zstd_level: int write_X_chunked: bool goal_chunk_nnz: int - col_names_to_store_as_ascii: Dict[str, List[str]] member_uris_are_relative: bool def __init__( @@ -58,7 +29,6 @@ def __init__( string_dim_zstd_level=22, # https://github.com/single-cell-data/TileDB-SingleCell/issues/27 write_X_chunked=True, goal_chunk_nnz=10000000, - col_names_to_store_as_ascii=default_col_names_to_store_as_ascii, member_uris_are_relative=None, # Allows relocatability for local disk / S3, and correct behavior for TileDB Cloud ): self.obs_extent = obs_extent @@ -69,5 +39,4 @@ def __init__( self.string_dim_zstd_level = string_dim_zstd_level self.write_X_chunked = write_X_chunked self.goal_chunk_nnz = goal_chunk_nnz - self.col_names_to_store_as_ascii = col_names_to_store_as_ascii self.member_uris_are_relative = member_uris_are_relative diff --git a/apis/python/tests/test_type_diversity.py b/apis/python/tests/test_type_diversity.py index 26201ffd6c..6e84e59216 100644 --- a/apis/python/tests/test_type_diversity.py +++ b/apis/python/tests/test_type_diversity.py @@ -70,11 +70,6 @@ def test_from_anndata_X_type(tmp_path, X_dtype_name, X_encoding): assert False # sanity - test misconfiguration adata = ad.AnnData(X=X, obs=obs, var=var, dtype=X.dtype) - print( - " =============================================================>==", - adata.X.dtype, - X_dtype, - ) assert adata.X.dtype == X_dtype # sanity 
io.from_anndata(SOMA(tmp_path.as_posix()), adata) @@ -180,6 +175,10 @@ def cmp_dtype(series, tdb: tiledb.Attr) -> bool: # TileDB has no object, so assume it will convert to the type underlying the object if ad_dtype == np.dtype("O"): ad_dtype = np.dtype(type(series[0])) + # TODO: see annotation_dataframe.py. Once Unicode attributes are queryable, we'll need + # to remove this check which is verifying the current force-to-ASCII workaround. + if ad_dtype.name == "str": + ad_dtype = np.dtype("S") # TileDB has no bool, and automatically converts to uint8 if ad_dtype == bool: ad_dtype = np.uint8 diff --git a/apis/python/tools/peek-ann b/apis/python/tools/peek-ann index 2924a69ad2..aa9aaabb3f 100755 --- a/apis/python/tools/peek-ann +++ b/apis/python/tools/peek-ann @@ -30,6 +30,9 @@ else: ann = anndata.read_h5ad(input_path) +def decat(ann): + return tiledbsc.util_ann._decategoricalize(ann) + # Interact at the prompt now: # * ann.X # * ann.obs.keys()