From 060a875bb4cbc37d0c4688bf05aaba615cf11021 Mon Sep 17 00:00:00 2001 From: Andrew Tolopko Date: Mon, 23 Jan 2023 16:00:00 -0500 Subject: [PATCH] update to tiledbsoma 0.5.0a6 (#82) - upgrade to tiledbsoma 0.5.0a6 - Unicode now supported in soma.DataFrame columns. - Removed unicode workaround, which forced all unicode values to ASCII. - Added explicit test to ensure continued support of unicode. - Fix AxisQuery import, now that class is exported by tiledbsoma - replace `somacore` imports with `soma` in notebooks --- api/python/cell_census/pyproject.toml | 2 +- .../src/cell_census/get_anndata.py | 6 ++--- .../api_demo/census_compute_over_X.ipynb | 7 +++-- .../notebooks/api_demo/census_datasets.ipynb | 3 +-- .../api_demo/census_high_variable_genes.ipynb | 6 ++--- .../api_demo/census_rank_gene_groups.ipynb | 3 +-- tools/cell_census_builder/census_summary.py | 7 ++--- tools/cell_census_builder/datasets.py | 7 ++--- .../cell_census_builder/experiment_builder.py | 9 +------ tools/cell_census_builder/globals.py | 8 +++--- .../summary_cell_counts.py | 3 --- .../cell_census_builder/tests/test_builder.py | 27 +++++++++++++++++++ tools/cell_census_builder/util.py | 16 ----------- tools/scripts/requirements-dev.txt | 1 + tools/scripts/requirements.txt | 2 +- 15 files changed, 49 insertions(+), 58 deletions(-) create mode 100644 tools/cell_census_builder/tests/test_builder.py diff --git a/api/python/cell_census/pyproject.toml b/api/python/cell_census/pyproject.toml index f4efd4c12..578cce464 100644 --- a/api/python/cell_census/pyproject.toml +++ b/api/python/cell_census/pyproject.toml @@ -31,7 +31,7 @@ dependencies= [ "numpy>=1.21", "requests", "tiledb>=0.19.0", - "tiledbsoma==0.5.0a4", + "tiledbsoma==0.5.0a6", "typing_extensions", "s3fs", "scikit-misc", diff --git a/api/python/cell_census/src/cell_census/get_anndata.py b/api/python/cell_census/src/cell_census/get_anndata.py index 5075d416b..521b68545 100644 --- a/api/python/cell_census/src/cell_census/get_anndata.py +++ 
b/api/python/cell_census/src/cell_census/get_anndata.py @@ -2,8 +2,6 @@ import anndata import tiledbsoma as soma -# TODO: rm this use use `soma.AxisQuery` after updating to tilebsoma 0.5.0a5 -from somacore import AxisQuery # TODO: rm this import and use `soma.AxisColumnNames` after https://github.com/single-cell-data/TileDB-SOMA/issues/791 from somacore.query.query import AxisColumnNames @@ -56,7 +54,7 @@ def get_anndata( exp = get_experiment(census, organism) with exp.axis_query( measurement_name, - obs_query=AxisQuery(value_filter=obs_value_filter) if obs_value_filter is not None else AxisQuery(), - var_query=AxisQuery(value_filter=var_value_filter) if var_value_filter is not None else AxisQuery(), + obs_query=soma.AxisQuery(value_filter=obs_value_filter) if obs_value_filter is not None else soma.AxisQuery(), + var_query=soma.AxisQuery(value_filter=var_value_filter) if var_value_filter is not None else soma.AxisQuery(), ) as query: return query.to_anndata(X_name=X_name, column_names=column_names) diff --git a/api/python/notebooks/api_demo/census_compute_over_X.ipynb b/api/python/notebooks/api_demo/census_compute_over_X.ipynb index 96fc08f28..d766c290e 100644 --- a/api/python/notebooks/api_demo/census_compute_over_X.ipynb +++ b/api/python/notebooks/api_demo/census_compute_over_X.ipynb @@ -35,7 +35,6 @@ "import cell_census\n", "import tiledbsoma as soma\n", "from tiledbsoma.experiment_query import X_as_series\n", - "import somacore\n", "\n", "census = cell_census.open_soma()\n", "mouse = census[\"census_data\"][\"mus_musculus\"]" @@ -244,7 +243,7 @@ "source": [ "with mouse.axis_query(\n", " measurement_name=\"RNA\",\n", - " obs_query=somacore.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n", ") as query:\n", " var_df = query.var().concat().to_pandas().set_index(\"soma_joinid\")\n", " n_vars = len(var_df)\n", @@ -537,7 +536,7 @@ "\n", "with mouse.axis_query(\n", " 
measurement_name=\"RNA\",\n", - " obs_query=somacore.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue=='brain' and sex=='male'\"),\n", ") as query:\n", " var_df = query.var().concat().to_pandas().set_index(\"soma_joinid\")\n", " n_vars = len(var_df)\n", @@ -717,7 +716,7 @@ "source": [ "with mouse.axis_query(\n", " measurement_name=\"RNA\",\n", - " obs_query=somacore.AxisQuery(value_filter=\"tissue=='brain'\"),\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue=='brain'\"),\n", ") as query:\n", " obs_df = query.obs(column_names=[\"soma_joinid\", \"dataset_id\"]).concat().to_pandas().set_index(\"soma_joinid\")\n", " var_df = query.var().concat().to_pandas().set_index(\"soma_joinid\")\n", diff --git a/api/python/notebooks/api_demo/census_datasets.ipynb b/api/python/notebooks/api_demo/census_datasets.ipynb index 9134dd234..797255287 100644 --- a/api/python/notebooks/api_demo/census_datasets.ipynb +++ b/api/python/notebooks/api_demo/census_datasets.ipynb @@ -289,7 +289,6 @@ "source": [ "import cell_census\n", "import tiledbsoma as soma\n", - "import somacore\n", "\n", "census = cell_census.open_soma()\n", "census_datasets = census[\"census_info\"][\"datasets\"].read().concat().to_pandas()\n", @@ -490,7 +489,7 @@ "mouse = census[\"census_data\"][\"mus_musculus\"]\n", "with mouse.axis_query(\n", " \"RNA\",\n", - " obs_query=somacore.AxisQuery(value_filter=\"dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'\"),\n", + " obs_query=soma.AxisQuery(value_filter=\"dataset_id == '0bd1a1de-3aee-40e0-b2ec-86c7a30c7149'\"),\n", ") as query:\n", " adata = query.to_anndata(\"raw\")\n", "\n", diff --git a/api/python/notebooks/api_demo/census_high_variable_genes.ipynb b/api/python/notebooks/api_demo/census_high_variable_genes.ipynb index 01d7aa17c..ae70f974c 100644 --- a/api/python/notebooks/api_demo/census_high_variable_genes.ipynb +++ b/api/python/notebooks/api_demo/census_high_variable_genes.ipynb @@ -48,12 
+48,12 @@ "source": [ "import numpy as np\n", "import pandas as pd\n", - "import somacore\n", + "import tiledbsoma as soma\n", "\n", "from cell_census.compute import OnlineMatrixMeanVariance\n", "\n", "\n", - "def highly_variable_genes(query: somacore.ExperimentAxisQuery, n_top_genes: int = 10) -> pd.DataFrame:\n", + "def highly_variable_genes(query: soma.ExperimentAxisQuery, n_top_genes: int = 10) -> pd.DataFrame:\n", " \"\"\"\n", " Acknowledgements: scanpy highly variable genes implementation, github.com/scverse/scanpy\n", " \"\"\"\n", @@ -315,7 +315,7 @@ "source": [ "with human.axis_query(\n", " measurement_name=\"RNA\",\n", - " obs_query=somacore.AxisQuery(value_filter=\"tissue == 'brain'\"),\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue == 'brain'\"),\n", ") as query:\n", " hvg = highly_variable_genes(query)\n", "\n", diff --git a/api/python/notebooks/api_demo/census_rank_gene_groups.ipynb b/api/python/notebooks/api_demo/census_rank_gene_groups.ipynb index fb133c8c3..c125bb073 100644 --- a/api/python/notebooks/api_demo/census_rank_gene_groups.ipynb +++ b/api/python/notebooks/api_demo/census_rank_gene_groups.ipynb @@ -26,7 +26,6 @@ "source": [ "import cell_census\n", "import tiledbsoma as soma\n", - "import somacore\n", "\n", "census = cell_census.open_soma(census_version=\"latest\")\n", "human = census[\"census_data\"][\"homo_sapiens\"]" @@ -67,7 +66,7 @@ "source": [ "with human.axis_query(\n", " \"RNA\",\n", - " obs_query=somacore.AxisQuery(\n", + " obs_query=soma.AxisQuery(\n", " value_filter=\"tissue_ontology_term_id == 'UBERON:0002048' and cell_type_ontology_term_id in ['CL:0000057', 'CL:0000623']\"\n", " ),\n", ") as query:\n", diff --git a/tools/cell_census_builder/census_summary.py b/tools/cell_census_builder/census_summary.py index 3b2a1f04a..dde275319 100644 --- a/tools/cell_census_builder/census_summary.py +++ b/tools/cell_census_builder/census_summary.py @@ -6,8 +6,8 @@ import tiledbsoma as soma from .experiment_builder import ExperimentBuilder,
get_summary_stats -from .globals import CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, TileDB_Ctx, SOMA_TileDB_Context -from .util import pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat +from .globals import CENSUS_SCHEMA_VERSION, CENSUS_SUMMARY_NAME, SOMA_TileDB_Context +from .util import uricat def create_census_summary( @@ -28,9 +28,6 @@ def create_census_summary( df = pd.DataFrame.from_records(data, columns=["label", "value"]) df["soma_joinid"] = range(len(df)) - # TODO: work-around for TileDB-SOMA#274. Remove when fixed. - df = pandas_dataframe_strings_to_ascii_issue_247_workaround(df) - # write to a SOMA dataframe summary_uri = uricat(info_collection.uri, CENSUS_SUMMARY_NAME) summary = soma.DataFrame(summary_uri, context=SOMA_TileDB_Context()) diff --git a/tools/cell_census_builder/datasets.py b/tools/cell_census_builder/datasets.py index f3a261ced..403d5677c 100644 --- a/tools/cell_census_builder/datasets.py +++ b/tools/cell_census_builder/datasets.py @@ -6,8 +6,8 @@ import pyarrow as pa import tiledbsoma as soma -from .globals import CENSUS_DATASETS_COLUMNS, CENSUS_DATASETS_NAME, TileDB_Ctx, SOMA_TileDB_Context -from .util import pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat +from .globals import CENSUS_DATASETS_COLUMNS, CENSUS_DATASETS_NAME, SOMA_TileDB_Context +from .util import uricat T = TypeVar("T", bound="Dataset") @@ -71,9 +71,6 @@ def create_dataset_manifest(info_collection: soma.Collection, datasets: List[Dat manifest_df = Dataset.to_dataframe(datasets) manifest_df = manifest_df[CENSUS_DATASETS_COLUMNS + ["soma_joinid"]] - # TODO: work-around for TileDB-SOMA#274. Remove when fixed. 
- manifest_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(manifest_df) - # write to a SOMA dataframe manifest_uri = uricat(info_collection.uri, CENSUS_DATASETS_NAME) manifest = soma.DataFrame(manifest_uri, context=SOMA_TileDB_Context()) diff --git a/tools/cell_census_builder/experiment_builder.py b/tools/cell_census_builder/experiment_builder.py index a53e7bb4e..578d60b8f 100644 --- a/tools/cell_census_builder/experiment_builder.py +++ b/tools/cell_census_builder/experiment_builder.py @@ -25,7 +25,7 @@ CENSUS_X_LAYERS_PLATFORM_CONFIG, CXG_OBS_TERM_COLUMNS, DONOR_ID_IGNORE, - TileDB_Ctx, FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, SOMA_TileDB_Context, + FEATURE_DATASET_PRESENCE_MATRIX_NAME, MEASUREMENT_RNA_NAME, SOMA_TileDB_Context, ) from .mp import create_process_pool_executor from .source_assets import cat_file @@ -35,7 +35,6 @@ anndata_ordered_bool_issue_853_workaround, array_chunker, is_positive_integral, - pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat, ) @@ -195,9 +194,6 @@ def accumulate_axes(self, dataset: Dataset, ad: anndata.AnnData, progress: Tuple # requires 'organism', do be careful not to delete that. 
obs_df = ad.obs[list(CXG_OBS_TERM_COLUMNS) + ["organism"]].reset_index(drop=True).copy() - # TODO XXX: Temporary work around pending resolution of TileDB-SOMA#274 - obs_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(obs_df) - obs_df["soma_joinid"] = range(self.n_obs, self.n_obs + len(obs_df)) obs_df["dataset_id"] = dataset.dataset_id @@ -250,9 +246,6 @@ def commit_axis(self) -> None: self.var_df = self.var_df.join(self.gene_feature_length["feature_length"], on="feature_id") self.var_df.feature_length.fillna(0, inplace=True) - # TODO XXX: Temporary work around pending resolution of TileDB-SOMA#274 - self.var_df = pandas_dataframe_strings_to_ascii_issue_247_workaround(self.var_df) - self.var_df = anndata_ordered_bool_issue_853_workaround(self.var_df) se.ms["RNA"].var.write( diff --git a/tools/cell_census_builder/globals.py b/tools/cell_census_builder/globals.py index 86615f102..2771111d4 100644 --- a/tools/cell_census_builder/globals.py +++ b/tools/cell_census_builder/globals.py @@ -2,7 +2,7 @@ import pyarrow as pa import tiledb -import tiledbsoma.options as soma_options +import tiledbsoma as soma CENSUS_SCHEMA_VERSION = "0.1.0" @@ -211,16 +211,16 @@ """ # Global SOMATileDBContext -_SOMA_TileDB_Context: soma_options.SOMATileDBContext = None +_SOMA_TileDB_Context: soma.options.SOMATileDBContext = None # Global TileDB context _TileDB_Ctx: tiledb.Ctx = None -def SOMA_TileDB_Context() -> soma_options.SOMATileDBContext: +def SOMA_TileDB_Context() -> soma.options.SOMATileDBContext: global _SOMA_TileDB_Context if _SOMA_TileDB_Context is None: - _SOMA_TileDB_Context = soma_options.SOMATileDBContext(tiledb_ctx=TileDB_Ctx()) + _SOMA_TileDB_Context = soma.options.SOMATileDBContext(tiledb_ctx=TileDB_Ctx()) return _SOMA_TileDB_Context diff --git a/tools/cell_census_builder/summary_cell_counts.py b/tools/cell_census_builder/summary_cell_counts.py index c11dcb342..b51163cde 100644 --- a/tools/cell_census_builder/summary_cell_counts.py +++ 
b/tools/cell_census_builder/summary_cell_counts.py @@ -9,7 +9,6 @@ from .globals import CENSUS_SUMMARY_CELL_COUNTS_COLUMNS, CENSUS_SUMMARY_CELL_COUNTS_NAME, SOMA_TileDB_Context from .util import ( anndata_ordered_bool_issue_853_workaround, - pandas_dataframe_strings_to_ascii_issue_247_workaround, uricat, ) @@ -29,8 +28,6 @@ def create_census_summary_cell_counts( ) df["soma_joinid"] = df.index.astype(np.int64) - # TODO: work-around for TileDB-SOMA#274. Remove when fixed. - df = pandas_dataframe_strings_to_ascii_issue_247_workaround(df) df = anndata_ordered_bool_issue_853_workaround(df) # write to a SOMA dataframe diff --git a/tools/cell_census_builder/tests/test_builder.py b/tools/cell_census_builder/tests/test_builder.py new file mode 100644 index 000000000..47285dab9 --- /dev/null +++ b/tools/cell_census_builder/tests/test_builder.py @@ -0,0 +1,27 @@ +# General unit tests for cell_census_builder. Intention is to add more fine-grained tests for builder. +import os +from tempfile import mkstemp, TemporaryDirectory + +import pandas as pd +import pyarrow as pa +import tiledbsoma as soma + + +def test_unicode_support() -> None: + """ + Regression test that unicode is supported correctly in tiledbsoma. + This test is not strictly necessary, but it validates the requirements that Cell Census + support unicode in DataFrame columns. 
+ """ + with TemporaryDirectory() as d: + pd_df = pd.DataFrame(data={'value': ["Ünicode", "S̈upport"]}, columns=['value']) + pd_df['soma_joinid'] = pd_df.index + s_df = soma.DataFrame(uri=os.path.join(d, "unicode_support")).\ + create(pa.Schema.from_pandas(pd_df, preserve_index=False), index_column_names=['soma_joinid']) + s_df.write(pa.Table.from_pandas(pd_df, preserve_index=False)) + + pd_df_in = soma.DataFrame(uri=os.path.join(d, "unicode_support")).read().concat().to_pandas() + + assert pd_df_in['value'].to_list() == ["Ünicode", "S̈upport"] + + diff --git a/tools/cell_census_builder/util.py b/tools/cell_census_builder/util.py index 678b2a0e8..1b8a9b945 100644 --- a/tools/cell_census_builder/util.py +++ b/tools/cell_census_builder/util.py @@ -83,22 +83,6 @@ def is_positive_integral(X: Union[npt.NDArray[np.floating[Any]], sparse.spmatrix return True -def pandas_dataframe_strings_to_ascii_issue_247_workaround(df: pd.DataFrame) -> pd.DataFrame: - """ - TileDB-SOMA _temporarily_ supports only ASCII in strings. - This code will convert all strings in a dataframe to ascii. - Remove this code when TileDB-SOMA#274 is resolved. 
- """ - import unicodedata - - warn("Converting dataframe strings to ASCII as temporary work-around for TileDB-SOMA#274.") - for k in df: - if df[k].dtype == object: - df[k] = df[k].map(lambda val: unicodedata.normalize("NFKD", val).encode("ascii", "ignore").decode()) - - return df - - def anndata_ordered_bool_issue_853_workaround(df: pd.DataFrame) -> pd.DataFrame: # """ # TileDB-SOMA does not support creating dataframe with categorical / dictionary diff --git a/tools/scripts/requirements-dev.txt b/tools/scripts/requirements-dev.txt index bd39f2377..3b0a48ed0 100644 --- a/tools/scripts/requirements-dev.txt +++ b/tools/scripts/requirements-dev.txt @@ -4,4 +4,5 @@ isort flake8-bugbear mypy==0.982 numpy +pytest types-requests diff --git a/tools/scripts/requirements.txt b/tools/scripts/requirements.txt index bf3689abf..e98fdec44 100644 --- a/tools/scripts/requirements.txt +++ b/tools/scripts/requirements.txt @@ -4,7 +4,7 @@ anndata numpy tiledb # NOTE: You can also build this dependency from source, per ./notebooks/README.md. -tiledbsoma==0.5.0a4 +tiledbsoma==0.5.0a6 scipy fsspec s3fs