Skip to content

Commit

Permalink
update to tiledbsoma 0.5.0a6 (#82)
Browse files Browse the repository at this point in the history
- upgrade to tiledbsoma 0.5.0a6
- Unicode now supported in soma.DataFrame columns.
- Removed unicode workaround, which forced all unicode values to ASCII.
- Added explicit test to ensure continued support of unicode.
- Fix AxisQuery import, now that class is exported by tiledbsoma
- replace `somacore` imports with `soma` in notebooks
  • Loading branch information
atolopko-czi authored Jan 23, 2023
1 parent 4d8b955 commit 060a875
Show file tree
Hide file tree
Showing 15 changed files with 49 additions and 58 deletions.
2 changes: 1 addition & 1 deletion api/python/cell_census/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dependencies= [
"numpy>=1.21",
"requests",
"tiledb>=0.19.0",
"tiledbsoma==0.5.0a4",
"tiledbsoma==0.5.0a6",
"typing_extensions",
"s3fs",
"scikit-misc",
Expand Down
6 changes: 2 additions & 4 deletions api/python/cell_census/src/cell_census/get_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import anndata
import tiledbsoma as soma
# TODO: rm this and use `soma.AxisQuery` after updating to tiledbsoma 0.5.0a5
from somacore import AxisQuery
# TODO: rm this import and use `soma.AxisColumnNames` after https://github.com/single-cell-data/TileDB-SOMA/issues/791
from somacore.query.query import AxisColumnNames

Expand Down Expand Up @@ -56,7 +54,7 @@ def get_anndata(
exp = get_experiment(census, organism)
with exp.axis_query(
measurement_name,
obs_query=AxisQuery(value_filter=obs_value_filter) if obs_value_filter is not None else AxisQuery(),
var_query=AxisQuery(value_filter=var_value_filter) if var_value_filter is not None else AxisQuery(),
obs_query=soma.AxisQuery(value_filter=obs_value_filter) if obs_value_filter is not None else soma.AxisQuery(),
var_query=soma.AxisQuery(value_filter=var_value_filter) if var_value_filter is not None else soma.AxisQuery(),
) as query:
return query.to_anndata(X_name=X_name, column_names=column_names)
7 changes: 3 additions & 4 deletions api/python/notebooks/api_demo/census_compute_over_X.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
"import cell_census\n",
"import tiledbsoma as soma\n",
"from tiledbsoma.experiment_query import X_as_series\n",
"import somacore\n",
"\n",
"census = cell_census.open_soma()\n",
"mouse = census[\"census_data\"][\"mus_musculus\"]"
Expand Down Expand Up @@ -244,7 +243,7 @@
"source": [
"with mouse.axis_query(\n",
" measurement_name=\"RNA\",\n",
" obs_query=somacore.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n",
" obs_query=soma.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n",
") as query:\n",
" var_df = query.var().concat().to_pandas().set_index(\"soma_joinid\")\n",
" n_vars = len(var_df)\n",
Expand Down Expand Up @@ -537,7 +536,7 @@
"\n",
"with mouse.axis_query(\n",
" measurement_name=\"RNA\",\n",
" obs_query=somacore.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n",
" obs_query=soma.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n",
") as query:\n",
" var_df = query.var().concat().to_pandas().set_index(\"soma_joinid\")\n",
" n_vars = len(var_df)\n",
Expand Down Expand Up @@ -717,7 +716,7 @@
"source": [
"with mouse.axis_query(\n",
" measurement_name=\"RNA\",\n",
" obs_query=somacore.AxisQuery(value_filter=\"tissue=='brain'\"),\n",
" obs_query=soma.AxisQuery(value_filter=\"tissue=='brain'\"),\n",
") as query:\n",
" obs_df = query.obs(column_names=[\"soma_joinid\", \"dataset_id\"]).concat().to_pandas().set_index(\"soma_joinid\")\n",
" var_df = query.var().concat().to_pandas().set_index(\"soma_joinid\")\n",
Expand Down
3 changes: 1 addition & 2 deletions api/python/notebooks/api_demo/census_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@
"source": [
"import cell_census\n",
"import tiledbsoma as soma\n",
"import somacore\n",
"\n",
"census = cell_census.open_soma()\n",
"census_datasets = census[\"census_info\"][\"datasets\"].read().concat().to_pandas()\n",
Expand Down Expand Up @@ -490,7 +489,7 @@
"mouse = census[\"census_data\"][\"mus_musculus\"]\n",
"with mouse.axis_query(\n",
" \"RNA\",\n",
" obs_query=somacore.AxisQuery(value_filter=\"dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'\"),\n",
" obs_query=soma.AxisQuery(value_filter=\"dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'\"),\n",
") as query:\n",
" adata = query.to_anndata(\"raw\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import somacore\n",
"import tiledbsoma as soma\n",
"\n",
"from cell_census.compute import OnlineMatrixMeanVariance\n",
"\n",
"\n",
"def highly_variable_genes(query: somacore.ExperimentAxisQuery, n_top_genes: int = 10) -> pd.DataFrame:\n",
"def highly_variable_genes(query: soma.ExperimentAxisQuery, n_top_genes: int = 10) -> pd.DataFrame:\n",
" \"\"\"\n",
" Acknowledgements: scanpy highly variable genes implementation, github.com/scverse/scanpy\n",
" \"\"\"\n",
Expand Down Expand Up @@ -315,7 +315,7 @@
"source": [
"with human.axis_query(\n",
" measurement_name=\"RNA\",\n",
" obs_query=somacore.AxisQuery(value_filter=\"tissue == 'brain'\"),\n",
" obs_query=soma.AxisQuery(value_filter=\"tissue == 'brain'\"),\n",
") as query:\n",
" hvg = highly_variable_genes(query)\n",
"\n",
Expand Down
3 changes: 1 addition & 2 deletions api/python/notebooks/api_demo/census_rank_gene_groups.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
"source": [
"import cell_census\n",
"import tiledbsoma as soma\n",
"import somacore\n",
"\n",
"census = cell_census.open_soma(census_version=\"latest\")\n",
"human = census[\"census_data\"][\"homo_sapiens\"]"
Expand Down Expand Up @@ -67,7 +66,7 @@
"source": [
"with human.axis_query(\n",
" \"RNA\",\n",
" obs_query=somacore.AxisQuery(\n",
" obs_query=soma.AxisQuery(\n",
" value_filter=\"tissue_ontology_term_id == 'UBERON:0002048' and cell_type_ontology_term_id in ['CL:0000057', 'CL:0000623']\"\n",
" ),\n",
") as query:\n",
Expand Down
7 changes: 2 additions & 5 deletions tools/cell_census_builder/census_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import tiledbsoma as soma

from .experiment_builder import ExperimentBuilder, get_summary_stats
from .globals import CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, TileDB_Ctx, SOMA_TileDB_Context
from .util import pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat
from .globals import CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, SOMA_TileDB_Context
from .util import uricat


def create_census_summary(
Expand All @@ -28,9 +28,6 @@ def create_census_summary(
df = pd.DataFrame.from_records(data, columns=["label", "value"])
df["soma_joinid"] = range(len(df))

# TODO: work-around for TileDB-SOMA#274. Remove when fixed.
df = pandas_dataframe_strings_to_ascii_issue_247_workaround(df)

# write to a SOMA dataframe
summary_uri = uricat(info_collection.uri, CENSUS_SUMMARY_NAME)
summary = soma.DataFrame(summary_uri, context=SOMA_TileDB_Context())
Expand Down
7 changes: 2 additions & 5 deletions tools/cell_census_builder/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import pyarrow as pa
import tiledbsoma as soma

from .globals import CENSUS_DATASETS_COLUMNS, CENSUS_DATASETS_NAME, TileDB_Ctx, SOMA_TileDB_Context
from .util import pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat
from .globals import CENSUS_DATASETS_COLUMNS, CENSUS_DATASETS_NAME, SOMA_TileDB_Context
from .util import uricat

T = TypeVar("T", bound="Dataset")

Expand Down Expand Up @@ -71,9 +71,6 @@ def create_dataset_manifest(info_collection: soma.Collection, datasets: List[Dat
manifest_df = Dataset.to_dataframe(datasets)
manifest_df = manifest_df[CENSUS_DATASETS_COLUMNS + ["soma_joinid"]]

# TODO: work-around for TileDB-SOMA#274. Remove when fixed.
manifest_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(manifest_df)

# write to a SOMA dataframe
manifest_uri = uricat(info_collection.uri, CENSUS_DATASETS_NAME)
manifest = soma.DataFrame(manifest_uri, context=SOMA_TileDB_Context())
Expand Down
9 changes: 1 addition & 8 deletions tools/cell_census_builder/experiment_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
CENSUS_X_LAYERS_PLATFORM_CONFIG,
CXG_OBS_TERM_COLUMNS,
DONOR_ID_IGNORE,
TileDB_Ctx, FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, SOMA_TileDB_Context,
FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, SOMA_TileDB_Context,
)
from .mp import create_process_pool_executor
from .source_assets import cat_file
Expand All @@ -35,7 +35,6 @@
anndata_ordered_bool_issue_853_workaround,
array_chunker,
is_positive_integral,
pandas_dataframe_strings_to_ascii_issue_247_workaround,
uricat,
)

Expand Down Expand Up @@ -195,9 +194,6 @@ def accumulate_axes(self, dataset: Dataset, ad: anndata.AnnData, progress: Tuple
# requires 'organism', do be careful not to delete that.
obs_df = ad.obs[list(CXG_OBS_TERM_COLUMNS) + ["organism"]].reset_index(drop=True).copy()

# TODO XXX: Temporary work around pending resolution of TileDB-SOMA#274
obs_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(obs_df)

obs_df["soma_joinid"] = range(self.n_obs, self.n_obs + len(obs_df))
obs_df["dataset_id"] = dataset.dataset_id

Expand Down Expand Up @@ -250,9 +246,6 @@ def commit_axis(self) -> None:
self.var_df = self.var_df.join(self.gene_feature_length["feature_length"], on="feature_id")
self.var_df.feature_length.fillna(0, inplace=True)

# TODO XXX: Temporary work around pending resolution of TileDB-SOMA#274
self.var_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(self.var_df)

self.var_df = anndata_ordered_bool_issue_853_workaround(self.var_df)

se.ms["RNA"].var.write(
Expand Down
8 changes: 4 additions & 4 deletions tools/cell_census_builder/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pyarrow as pa
import tiledb
import tiledbsoma.options as soma_options
import tiledbsoma as soma

CENSUS_SCHEMA_VERSION = "0.1.0"

Expand Down Expand Up @@ -211,16 +211,16 @@
"""

# Global SOMATileDBContext
_SOMA_TileDB_Context: soma_options.SOMATileDBContext = None
_SOMA_TileDB_Context: soma.options.SOMATileDBContext = None

# Global TileDB context
_TileDB_Ctx: tiledb.Ctx = None


def SOMA_TileDB_Context() -> soma_options.SOMATileDBContext:
def SOMA_TileDB_Context() -> soma.options.SOMATileDBContext:
global _SOMA_TileDB_Context
if _SOMA_TileDB_Context is None:
_SOMA_TileDB_Context = soma_options.SOMATileDBContext(tiledb_ctx=TileDB_Ctx())
_SOMA_TileDB_Context = soma.options.SOMATileDBContext(tiledb_ctx=TileDB_Ctx())
return _SOMA_TileDB_Context


Expand Down
3 changes: 0 additions & 3 deletions tools/cell_census_builder/summary_cell_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from .globals import CENSUS_SUMMARY_CELL_COUNTS_COLUMNS, CENSUS_SUMMARY_CELL_COUNTS_NAME, SOMA_TileDB_Context
from .util import (
anndata_ordered_bool_issue_853_workaround,
pandas_dataframe_strings_to_ascii_issue_247_workaround,
uricat,
)

Expand All @@ -29,8 +28,6 @@ def create_census_summary_cell_counts(
)
df["soma_joinid"] = df.index.astype(np.int64)

# TODO: work-around for TileDB-SOMA#274. Remove when fixed.
df = pandas_dataframe_strings_to_ascii_issue_247_workaround(df)
df = anndata_ordered_bool_issue_853_workaround(df)

# write to a SOMA dataframe
Expand Down
27 changes: 27 additions & 0 deletions tools/cell_census_builder/tests/test_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# General unit tests for cell_census_builder. Intention is to add more fine-grained tests for builder.
import os
from tempfile import mkstemp, TemporaryDirectory

import pandas as pd
import pyarrow as pa
import tiledbsoma as soma


def test_unicode_support() -> None:
    """
    Regression check that unicode strings round-trip through a tiledbsoma DataFrame.

    This test is not strictly necessary, but it pins down the requirement that
    Cell Census supports unicode in DataFrame columns.
    """
    expected_values = ["Ünicode", "S̈upport"]

    with TemporaryDirectory() as tmp_dir:
        df_uri = os.path.join(tmp_dir, "unicode_support")

        # Frame to persist; soma_joinid is taken from the default pandas index.
        source_df = pd.DataFrame(data={'value': ["Ünicode", "S̈upport"]}, columns=['value'])
        source_df['soma_joinid'] = source_df.index

        # Create the SOMA DataFrame from the pandas-derived schema, then write the rows.
        soma_df = soma.DataFrame(uri=df_uri).create(
            pa.Schema.from_pandas(source_df, preserve_index=False),
            index_column_names=['soma_joinid'],
        )
        soma_df.write(pa.Table.from_pandas(source_df, preserve_index=False))

        # Read back through a fresh DataFrame object opened on the same URI.
        round_tripped = soma.DataFrame(uri=df_uri).read().concat().to_pandas()

        assert round_tripped['value'].to_list() == expected_values


16 changes: 0 additions & 16 deletions tools/cell_census_builder/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,22 +83,6 @@ def is_positive_integral(X: Union[npt.NDArray[np.floating[Any]], sparse.spmatrix
return True


def pandas_dataframe_strings_to_ascii_issue_247_workaround(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert every string in the object-dtype columns of ``df`` to ASCII.

    TileDB-SOMA _temporarily_ supports only ASCII in strings. Each string is
    NFKD-normalized and any character that still cannot be encoded as ASCII is
    dropped. Non-string values found in object columns (e.g. ``None`` or NaN)
    are passed through unchanged instead of raising ``TypeError``.

    The dataframe is modified in place and the same object is returned.

    Remove this code when TileDB-SOMA#274 is resolved.
    NOTE(review): the function name says issue 247 while the messages say #274 --
    confirm which issue number is correct.
    """
    import unicodedata

    def _to_ascii(val: object) -> object:
        # unicodedata.normalize only accepts str; leave missing values and other objects alone.
        if not isinstance(val, str):
            return val
        return unicodedata.normalize("NFKD", val).encode("ascii", "ignore").decode()

    warn("Converting dataframe strings to ASCII as temporary work-around for TileDB-SOMA#274.")
    for k in df:
        # Only object-dtype columns are converted; other dtypes are left untouched.
        if df[k].dtype == object:
            df[k] = df[k].map(_to_ascii)

    return df


def anndata_ordered_bool_issue_853_workaround(df: pd.DataFrame) -> pd.DataFrame:
# """
# TileDB-SOMA does not support creating dataframe with categorical / dictionary
Expand Down
1 change: 1 addition & 0 deletions tools/scripts/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ isort
flake8-bugbear
mypy==0.982
numpy
pytest
types-requests
2 changes: 1 addition & 1 deletion tools/scripts/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ anndata
numpy
tiledb
# NOTE: You can also build this dependency from source, per ./notebooks/README.md.
tiledbsoma==0.5.0a4
tiledbsoma==0.5.0a6
scipy
fsspec
s3fs
Expand Down

0 comments on commit 060a875

Please sign in to comment.