diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646982378_1730646982378_3d585af40365b4e37b8f8ba3eb5d397f b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646982378_1730646982378_3d585af40365b4e37b8f8ba3eb5d397f new file mode 100644 index 0000000000..a1ca32c564 Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646982378_1730646982378_3d585af40365b4e37b8f8ba3eb5d397f differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646982386_1730646982386_30f8bc3e229e6b2a077df9af392f5b75 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646982386_1730646982386_30f8bc3e229e6b2a077df9af392f5b75 new file mode 100644 index 0000000000..4527feb154 Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646982386_1730646982386_30f8bc3e229e6b2a077df9af392f5b75 differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646982394_1730646982394_14af812a1f2bc292bf6265be3d5c6a46 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646982394_1730646982394_14af812a1f2bc292bf6265be3d5c6a46 new file mode 100644 index 0000000000..a1ca32c564 Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646982394_1730646982394_14af812a1f2bc292bf6265be3d5c6a46 differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646982400_1730646982400_6f65cb3d6c3f58482216217921b61181 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646982400_1730646982400_6f65cb3d6c3f58482216217921b61181 new file mode 100644 index 0000000000..a1ca32c564 Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646982400_1730646982400_6f65cb3d6c3f58482216217921b61181 differ 
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646982407_1730646982407_73bf9a9a85a4bab5e0243fb2bfe3ef04 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646982407_1730646982407_73bf9a9a85a4bab5e0243fb2bfe3ef04 new file mode 100644 index 0000000000..cc39a7493b Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646982407_1730646982407_73bf9a9a85a4bab5e0243fb2bfe3ef04 differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646982412_1730646982412_3be9315f3bf7a6b41fdc64385428b0c9 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646982412_1730646982412_3be9315f3bf7a6b41fdc64385428b0c9 new file mode 100644 index 0000000000..cc39a7493b Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646982412_1730646982412_3be9315f3bf7a6b41fdc64385428b0c9 differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/var/__schema/__1730645864313_1730645864313_7e3643f24b8a0bd62c5bc3826383b202 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/var/__schema/__1730645864313_1730645864313_7e3643f24b8a0bd62c5bc3826383b202 new file mode 100644 index 0000000000..35af3128b1 Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/var/__schema/__1730645864313_1730645864313_7e3643f24b8a0bd62c5bc3826383b202 differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646982418_1730646982418_523922bbaad0be888311e0c354db3b53 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646982418_1730646982418_523922bbaad0be888311e0c354db3b53 new file mode 100644 index 0000000000..3939bd284a Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646982418_1730646982418_523922bbaad0be888311e0c354db3b53 differ diff --git 
a/apis/python/notebooks/data/dense/pbmc3k/ms/raw/X/data/__schema/__1730646982429_1730646982429_40368fa8c7242d638bcbbb1bca05ec1f b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/X/data/__schema/__1730646982429_1730646982429_40368fa8c7242d638bcbbb1bca05ec1f new file mode 100644 index 0000000000..cd0a921f44 Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/X/data/__schema/__1730646982429_1730646982429_40368fa8c7242d638bcbbb1bca05ec1f differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/raw/var/__schema/__1730646982424_1730646982424_326d997a982823e2faacaaf703a9eb1e b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/var/__schema/__1730646982424_1730646982424_326d997a982823e2faacaaf703a9eb1e new file mode 100644 index 0000000000..aa919d163d Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/var/__schema/__1730646982424_1730646982424_326d997a982823e2faacaaf703a9eb1e differ diff --git a/apis/python/notebooks/data/dense/pbmc3k/obs/__schema/__1730645864297_1730645864297_59235d8e66670248696b378ddf914c84 b/apis/python/notebooks/data/dense/pbmc3k/obs/__schema/__1730645864297_1730645864297_59235d8e66670248696b378ddf914c84 new file mode 100644 index 0000000000..e3350785db Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/obs/__schema/__1730645864297_1730645864297_59235d8e66670248696b378ddf914c84 differ diff --git a/apis/python/notebooks/data/pbmc3k_raw.h5ad b/apis/python/notebooks/data/pbmc3k_raw.h5ad new file mode 100644 index 0000000000..460ee0b60a Binary files /dev/null and b/apis/python/notebooks/data/pbmc3k_raw.h5ad differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/X/data/__schema/__1730646967626_1730646967626_67119b59c3aa2e4b2f5285eb14924fb5 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/X/data/__schema/__1730646967626_1730646967626_67119b59c3aa2e4b2f5285eb14924fb5 new file mode 100644 index 0000000000..1fa7fd9632 Binary files /dev/null and 
b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/X/data/__schema/__1730646967626_1730646967626_67119b59c3aa2e4b2f5285eb14924fb5 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646967637_1730646967637_3f0aa65f7ea0ca84eb931e62a3f066d2 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646967637_1730646967637_3f0aa65f7ea0ca84eb931e62a3f066d2 new file mode 100644 index 0000000000..a1ca32c564 Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646967637_1730646967637_3f0aa65f7ea0ca84eb931e62a3f066d2 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646967644_1730646967644_187ad8169f9d5226654947d0afc0cc55 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646967644_1730646967644_187ad8169f9d5226654947d0afc0cc55 new file mode 100644 index 0000000000..4527feb154 Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646967644_1730646967644_187ad8169f9d5226654947d0afc0cc55 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646967650_1730646967650_45a6de13169ab7508777260a4e4ded09 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646967650_1730646967650_45a6de13169ab7508777260a4e4ded09 new file mode 100644 index 0000000000..a1ca32c564 Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646967650_1730646967650_45a6de13169ab7508777260a4e4ded09 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646967656_1730646967656_73cc8ae0b6ae7072842d72a4b2b33352 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646967656_1730646967656_73cc8ae0b6ae7072842d72a4b2b33352 new file mode 100644 index 0000000000..a1ca32c564 Binary files 
/dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646967656_1730646967656_73cc8ae0b6ae7072842d72a4b2b33352 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646967663_1730646967663_3d149150f3b7c652f8f91820c0e5690b b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646967663_1730646967663_3d149150f3b7c652f8f91820c0e5690b new file mode 100644 index 0000000000..cc39a7493b Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646967663_1730646967663_3d149150f3b7c652f8f91820c0e5690b differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646967668_1730646967668_28f39375ad27b80e5c591ad3521afda1 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646967668_1730646967668_28f39375ad27b80e5c591ad3521afda1 new file mode 100644 index 0000000000..cc39a7493b Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646967668_1730646967668_28f39375ad27b80e5c591ad3521afda1 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/var/__schema/__1730645851538_1730645851538_737b2e86bb24ab8b3d9ef856a43ec743 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/var/__schema/__1730645851538_1730645851538_737b2e86bb24ab8b3d9ef856a43ec743 new file mode 100644 index 0000000000..35af3128b1 Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/var/__schema/__1730645851538_1730645851538_737b2e86bb24ab8b3d9ef856a43ec743 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646967674_1730646967674_14c5d26b3736039bce3a5b9632724554 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646967674_1730646967674_14c5d26b3736039bce3a5b9632724554 new file mode 100644 index 0000000000..3939bd284a Binary files 
/dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646967674_1730646967674_14c5d26b3736039bce3a5b9632724554 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/X/data/__schema/__1730646967685_1730646967685_7cf688bc654e41169a4b4040d3518a83 b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/X/data/__schema/__1730646967685_1730646967685_7cf688bc654e41169a4b4040d3518a83 new file mode 100644 index 0000000000..cd0a921f44 Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/X/data/__schema/__1730646967685_1730646967685_7cf688bc654e41169a4b4040d3518a83 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/var/__schema/__1730646967680_1730646967680_606ea9e41f3ae67cdb16e9a4cdfc8f67 b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/var/__schema/__1730646967680_1730646967680_606ea9e41f3ae67cdb16e9a4cdfc8f67 new file mode 100644 index 0000000000..aa919d163d Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/var/__schema/__1730646967680_1730646967680_606ea9e41f3ae67cdb16e9a4cdfc8f67 differ diff --git a/apis/python/notebooks/data/sparse/pbmc3k/obs/__schema/__1730645851522_1730645851522_6624ac044dc93b8508f24f435c18e9e3 b/apis/python/notebooks/data/sparse/pbmc3k/obs/__schema/__1730645851522_1730645851522_6624ac044dc93b8508f24f435c18e9e3 new file mode 100644 index 0000000000..e3350785db Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/obs/__schema/__1730645851522_1730645851522_6624ac044dc93b8508f24f435c18e9e3 differ diff --git a/apis/python/notebooks/tutorial_exp_query.ipynb b/apis/python/notebooks/tutorial_exp_query.ipynb index 26f1d1d5e7..5dc8804677 100644 --- a/apis/python/notebooks/tutorial_exp_query.ipynb +++ b/apis/python/notebooks/tutorial_exp_query.ipynb @@ -1,296 +1,532 @@ { - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - 
"version": "3.9.15", - "mimetype": "text/x-python", - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "pygments_lexer": "ipython3", - "nbconvert_exporter": "python", - "file_extension": ".py" - } + "cells": [ + { + "cell_type": "markdown", + "id": "2b8e72a7-129c-422c-b955-350fb9ee0541", + "metadata": { + "tags": [] + }, + "source": [ + "# Tutorial: SOMA Experiment queries" + ] }, - "nbformat_minor": 5, - "nbformat": 4, - "cells": [ - { - "cell_type": "markdown", - "source": "# Tutorial: SOMA Experiment queries", - "metadata": { - "tags": [] - }, - "id": "2b8e72a7-129c-422c-b955-350fb9ee0541" - }, - { - "cell_type": "code", - "source": "import tiledbsoma as soma", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 3, - "outputs": [], - "id": "3a5fd5d3" - }, - { - "cell_type": "markdown", - "source": "In this notebook, we'll take a quick look at the SOMA experiment-query API. The dataset used is from Peripheral Blood Mononuclear Cells (PBMC), which is freely available from 10X Genomics.\n", - "metadata": { - "tags": [] - }, - "id": "ccc8709a" - }, - { - "cell_type": "code", - "source": "exp = soma.Experiment.open('data/sparse/pbmc3k')", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 4, - "outputs": [], - "id": "9b8851d9-27f1-437b-a070-b41a65a5609e" - }, - { - "cell_type": "markdown", - "source": "Using the keys of the `obs` dataframe, we can see what fields are available to query on.", - "metadata": { - "tags": [] - }, - "id": "fab7898c" - }, - { - "cell_type": "code", - "source": "exp.obs.keys()", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 5, - "outputs": [ - { - "execution_count": 5, - "output_type": "execute_result", - "data": { - "text/plain": "('soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain')" - }, - "metadata": {} - } - ], - "id": "d67dfbc6-0382-4acc-8c56-3670549654f8" - }, - { - "cell_type": "code", - "source": "p = 
exp.obs.read(column_names=['louvain']).concat().to_pandas()\np", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 6, - "outputs": [ - { - "execution_count": 6, - "output_type": "execute_result", - "data": { - "text/plain": " louvain\n0 CD4 T cells\n1 B cells\n2 CD4 T cells\n3 CD14+ Monocytes\n4 NK cells\n... ...\n2633 CD14+ Monocytes\n2634 B cells\n2635 B cells\n2636 B cells\n2637 CD4 T cells\n\n[2638 rows x 1 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
louvain
0CD4 T cells
1B cells
2CD4 T cells
3CD14+ Monocytes
4NK cells
......
2633CD14+ Monocytes
2634B cells
2635B cells
2636B cells
2637CD4 T cells
\n

2638 rows × 1 columns

\n
" - }, - "metadata": {} - } - ], - "id": "9e4ede09-2303-4c21-92c1-bf42ed4e7dd1" - }, - { - "cell_type": "markdown", - "source": "Focusing on the `louvain` column, we can now find out what column values are present in the data.", - "metadata": { - "tags": [] - }, - "id": "f305fb7c" - }, - { - "cell_type": "code", - "source": "p.groupby('louvain').size().sort_values()", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 7, - "outputs": [ - { - "execution_count": 7, - "output_type": "execute_result", - "data": { - "text/plain": "louvain\nMegakaryocytes 15\nDendritic cells 37\nFCGR3A+ Monocytes 150\nNK cells 154\nCD8 T cells 316\nB cells 342\nCD14+ Monocytes 480\nCD4 T cells 1144\ndtype: int64" - }, - "metadata": {} - } - ], - "id": "00f1ccad-3ee2-4947-8961-8bf9642fbbba" - }, - { - "cell_type": "markdown", - "source": "Now we can query the SOMA experiment -- here, by a few cell types.", - "metadata": { - "tags": [] - }, - "id": "fda99535" - }, - { - "cell_type": "code", - "source": "obs_query = soma.AxisQuery(value_filter='louvain in [\"B cells\", \"NK cells\"]')", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 8, - "outputs": [], - "id": "e2ed76ca-5821-44c5-a220-ff96568686ec" - }, - { - "cell_type": "code", - "source": "query = exp.axis_query(\"RNA\", obs_query=obs_query)", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 9, - "outputs": [], - "id": "f3af70bc-3817-453c-a18c-56dc9aa874da" - }, - { - "cell_type": "markdown", - "source": "Note that the query output is smaller than the original dataset's size -- since we've queried for only a particular pair of cell types.", - "metadata": { - "tags": [] - }, - "id": "fb94d898" - }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3a5fd5d3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import tiledbsoma as soma" + ] + }, + { + "cell_type": "markdown", + "id": "ccc8709a", + "metadata": { + "tags": [] + }, + "source": 
[ + "In this notebook, we'll take a quick look at the SOMA experiment-query API. The dataset used is from Peripheral Blood Mononuclear Cells (PBMC), which is freely available from 10X Genomics.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9b8851d9-27f1-437b-a070-b41a65a5609e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "exp = soma.Experiment.open('data/sparse/pbmc3k')" + ] + }, + { + "cell_type": "markdown", + "id": "fab7898c", + "metadata": { + "tags": [] + }, + "source": [ + "Using the keys of the `obs` dataframe, we can see what fields are available to query on." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d67dfbc6-0382-4acc-8c56-3670549654f8", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "source": "[exp.obs.count, exp.ms[\"RNA\"].var.count]", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 10, - "outputs": [ - { - "execution_count": 10, - "output_type": "execute_result", - "data": { - "text/plain": "[2638, 1838]" - }, - "metadata": {} - } - ], - "id": "2c60568b-0789-4dbf-aff9-4bea2860aef4" - }, + "data": { + "text/plain": [ + "('soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exp.obs.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9e4ede09-2303-4c21-92c1-bf42ed4e7dd1", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "source": "[query.n_obs, query.n_vars]", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 11, - "outputs": [ - { - "execution_count": 11, - "output_type": "execute_result", - "data": { - "text/plain": "[496, 1838]" - }, - "metadata": {} - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
louvain
0CD4 T cells
1B cells
2CD4 T cells
3CD14+ Monocytes
4NK cells
......
2633CD14+ Monocytes
2634B cells
2635B cells
2636B cells
2637CD4 T cells
\n", + "

2638 rows × 1 columns

\n", + "
" ], - "id": "28ed8d40-36c5-4642-bd8f-53d35c3074f0" - }, - { - "cell_type": "markdown", - "source": "Here we can take a look at the X data.", - "metadata": { - "tags": [] - }, - "id": "c9625771" - }, + "text/plain": [ + " louvain\n", + "0 CD4 T cells\n", + "1 B cells\n", + "2 CD4 T cells\n", + "3 CD14+ Monocytes\n", + "4 NK cells\n", + "... ...\n", + "2633 CD14+ Monocytes\n", + "2634 B cells\n", + "2635 B cells\n", + "2636 B cells\n", + "2637 CD4 T cells\n", + "\n", + "[2638 rows x 1 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = exp.obs.read(column_names=['louvain']).concat().to_pandas()\n", + "p" + ] + }, + { + "cell_type": "markdown", + "id": "f305fb7c", + "metadata": { + "tags": [] + }, + "source": [ + "Focusing on the `louvain` column, we can now find out what column values are present in the data." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "00f1ccad-3ee2-4947-8961-8bf9642fbbba", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "source": "query.X(\"data\").tables().concat().to_pandas()", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 12, - "outputs": [ - { - "execution_count": 12, - "output_type": "execute_result", - "data": { - "text/plain": " soma_dim_0 soma_dim_1 soma_data\n0 1 0 -0.214582\n1 1 1 -0.372653\n2 1 2 -0.054804\n3 1 3 -0.683391\n4 1 4 0.633951\n... ... ... ...\n911643 2636 1833 -0.149789\n911644 2636 1834 -0.325824\n911645 2636 1835 -0.005918\n911646 2636 1836 -0.135213\n911647 2636 1837 -0.482111\n\n[911648 rows x 3 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
soma_dim_0soma_dim_1soma_data
010-0.214582
111-0.372653
212-0.054804
313-0.683391
4140.633951
............
91164326361833-0.149789
91164426361834-0.325824
91164526361835-0.005918
91164626361836-0.135213
91164726361837-0.482111
\n

911648 rows × 3 columns

\n
" - }, - "metadata": {} - } - ], - "id": "65063167-5015-497a-9712-d72c0ecac2ed" - }, + "data": { + "text/plain": [ + "louvain\n", + "Megakaryocytes 15\n", + "Dendritic cells 37\n", + "FCGR3A+ Monocytes 150\n", + "NK cells 154\n", + "CD8 T cells 316\n", + "B cells 342\n", + "CD14+ Monocytes 480\n", + "CD4 T cells 1144\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p.groupby('louvain').size().sort_values()" + ] + }, + { + "cell_type": "markdown", + "id": "fda99535", + "metadata": { + "tags": [] + }, + "source": [ + "Now we can query the SOMA experiment -- here, by a few cell types." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e2ed76ca-5821-44c5-a220-ff96568686ec", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "obs_query = soma.AxisQuery(value_filter='louvain in [\"B cells\", \"NK cells\"]')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f3af70bc-3817-453c-a18c-56dc9aa874da", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "query = exp.axis_query(\"RNA\", obs_query=obs_query)" + ] + }, + { + "cell_type": "markdown", + "id": "fb94d898", + "metadata": { + "tags": [] + }, + "source": [ + "Note that the query output is smaller than the original dataset's size -- since we've queried for only a particular pair of cell types." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2c60568b-0789-4dbf-aff9-4bea2860aef4", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "source": "To finish out this introductory look at the experiment-query API, we can convert our query outputs to AnnData format.", - "metadata": { - "tags": [] - }, - "id": "db7af8b8" - }, + "data": { + "text/plain": [ + "[2638, 1838]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[exp.obs.count, exp.ms[\"RNA\"].var.count]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "28ed8d40-36c5-4642-bd8f-53d35c3074f0", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "source": "adata = query.to_anndata(X_name=\"data\")", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 13, - "outputs": [], - "id": "1ed8510b-343a-4f88-8aae-11a5c2069311" - }, + "data": { + "text/plain": [ + "[496, 1838]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[query.n_obs, query.n_vars]" + ] + }, + { + "cell_type": "markdown", + "id": "c9625771", + "metadata": { + "tags": [] + }, + "source": [ + "Here we can take a look at the X data." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "65063167-5015-497a-9712-d72c0ecac2ed", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "source": "adata", - "metadata": { - "tags": [], - "trusted": true - }, - "execution_count": 14, - "outputs": [ - { - "execution_count": 14, - "output_type": "execute_result", - "data": { - "text/plain": "AnnData object with n_obs × n_vars = 496 × 1838\n obs: 'soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain'\n var: 'soma_joinid', 'var_id', 'n_cells'" - }, - "metadata": {} - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
soma_dim_0soma_dim_1soma_data
010-0.214582
111-0.372653
212-0.054804
313-0.683391
4140.633951
............
91164326361833-0.149789
91164426361834-0.325824
91164526361835-0.005918
91164626361836-0.135213
91164726361837-0.482111
\n", + "

911648 rows × 3 columns

\n", + "
" ], - "id": "b3118504-8c92-48d4-9b83-87176960e4f1" - }, + "text/plain": [ + " soma_dim_0 soma_dim_1 soma_data\n", + "0 1 0 -0.214582\n", + "1 1 1 -0.372653\n", + "2 1 2 -0.054804\n", + "3 1 3 -0.683391\n", + "4 1 4 0.633951\n", + "... ... ... ...\n", + "911643 2636 1833 -0.149789\n", + "911644 2636 1834 -0.325824\n", + "911645 2636 1835 -0.005918\n", + "911646 2636 1836 -0.135213\n", + "911647 2636 1837 -0.482111\n", + "\n", + "[911648 rows x 3 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query.X(\"data\").tables().concat().to_pandas()" + ] + }, + { + "cell_type": "markdown", + "id": "db7af8b8", + "metadata": { + "tags": [] + }, + "source": [ + "To finish out this introductory look at the experiment-query API, we can convert our query outputs to AnnData format." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1ed8510b-343a-4f88-8aae-11a5c2069311", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "adata = query.to_anndata(X_name=\"data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b3118504-8c92-48d4-9b83-87176960e4f1", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "source": "", - "metadata": { - "trusted": true - }, - "execution_count": null, - "outputs": [], - "id": "f2e46ce1-cf7a-43c2-9f0d-bf918fd806bc" + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 496 × 1838\n", + " obs: 'soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain'\n", + " var: 'soma_joinid', 'var_id', 'n_cells'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } - ] + ], + "source": [ + "adata" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/apis/python/notebooks/tutorial_soma_append_mode.ipynb b/apis/python/notebooks/tutorial_soma_append_mode.ipynb index d33d12f0f1..a69a40a0f6 100644 --- a/apis/python/notebooks/tutorial_soma_append_mode.ipynb +++ b/apis/python/notebooks/tutorial_soma_append_mode.ipynb @@ -1,1150 +1,1261 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "# Tutorial: TileDB-SOMA append-mode" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "As of TileDB-SOMA 1.5.0, we're excited to offer support for append mode.\n", - "\n", - "Use-cases include ingesting H5AD/AnnData from multiple sequencing runs over time, accumulating the data over time, into millions of cells." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "First, we'll do the usual package imports:" - ] - }, + "cells": [ + { + "cell_type": "markdown", + "id": "36a0b22b", + "metadata": { + "tags": [] + }, + "source": [ + "# Tutorial: TileDB-SOMA append-mode" + ] + }, + { + "cell_type": "markdown", + "id": "69de8627", + "metadata": { + "tags": [] + }, + "source": [ + "As of TileDB-SOMA 1.5.0, we're excited to offer support for append mode.\n", + "\n", + "Use-cases include ingesting H5AD/AnnData from multiple sequencing runs over time, accumulating the data over time, into millions of cells." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2a218461", + "metadata": { + "tags": [] + }, + "source": [ + "First, we'll do the usual package imports:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d6b81174", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tiledbsoma.__version__ 1.11.1\n", - "TileDB-Py version 0.29.0\n", - "TileDB core version (tiledb) 2.23.0\n", - "TileDB core version (libtiledbsoma) 2.23.0\n", - "python version 3.11.8.final.0\n", - "OS version Linux 4.14.343-261.564.amzn2.x86_64\n" - ] - } - ], - "source": [ - "import scanpy as sc\n", - "import tiledbsoma\n", - "import tiledbsoma.io\n", - "import tiledbsoma.logging\n", - "\n", - "tiledbsoma.show_package_versions()" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "tiledbsoma.__version__ 1.15.0rc0.post92.dev1796808282\n", + "TileDB core version (libtiledbsoma) 2.27.0\n", + "python version 3.11.9.final.0\n", + "OS version Darwin 24.1.0\n" + ] + } + ], + "source": [ + "import scanpy as sc\n", + "import tiledbsoma\n", + "import tiledbsoma.io\n", + "import tiledbsoma.logging\n", + "\n", + "tiledbsoma.show_package_versions()" + ] + }, + { + "cell_type": "markdown", + "id": "a7a65011", + "metadata": { + "tags": [] + }, + "source": [ + "Next we'll set up where our data are going:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "24108e1c", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Next we'll set up where our data are going:" + "data": { + "text/plain": [ + "'/tmp/append-example-20241103-094551'" ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import datetime\n", + "\n", + "stamp = 
datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n", + "experiment_uri = f\"/tmp/append-example-{stamp}\"\n", + "experiment_uri" + ] + }, + { + "cell_type": "markdown", + "id": "e835c440", + "metadata": { + "tags": [] + }, + "source": [ + "For this demo, we're writing to `/tmp`, but URIs like the following allow storing data on TileDB Cloud, cloud storage such as S3, or instance-local NVME:\n", + "\n", + "- `/var/data/mysoma1`\n", + "- `s3://mybucket/mysoma2`\n", + "- `tiledb://mynamespace/s3://mybucket/mysoma3`\n", + "\n", + "Everything in this notebook below this URI-selection cell is agnostic to the storage backend." + ] + }, + { + "cell_type": "markdown", + "id": "0ffee7b3", + "metadata": { + "tags": [] + }, + "source": [ + "## Create the initial SOMA Experiment" + ] + }, + { + "cell_type": "markdown", + "id": "bb3aa6c8", + "metadata": { + "tags": [] + }, + "source": [ + "Next we'll prep some input data. To make things easy for this self-contained demo, we'll use Scanpy's `pbmc3k`, with a custom column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fe0e7a46", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'/tmp/append-example-20240521-145833'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import datetime\n", - "\n", - "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n", - "experiment_uri = f\"/tmp/append-example-{stamp}\"\n", - "experiment_uri" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████| 5.58M/5.58M [00:01<00:00, 4.18MB/s]\n" + ] + } + ], + "source": [ + "ad1 = sc.datasets.pbmc3k()\n", + "sc.pp.calculate_qc_metrics(ad1, inplace=True)\n", + "ad1.obs[\"when\"] = [\"Monday\"] * len(ad1.obs)" + ] + }, + { + "cell_type": "markdown", + "id": "88af955c", + "metadata": { + "tags": [] + }, + "source": [ + "Now we're ready to ingest the data into a SOMA experiment. Since SOMA is multimodal, we'll specify the destination modality, or measurement name, to be \"RNA\"." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "10cbd82b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "measurement_name = \"RNA\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a7c7914f", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "For this demo, we're writing to `/tmp`, but URIs like the following allow storing data on TileDB Cloud, cloud storage such as S3, or instance-local NVME:\n", - "\n", - "- `/var/data/mysoma1`\n", - "- `s3://mybucket/mysoma2`\n", - "- `tiledb://mynamespace/s3://mybucket/mysoma3`\n", - "\n", - "Everything in this notebook below this URI-selection cell is agnostic to the storage backend." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Registration: registering isolated AnnData object.\n", + "Wrote /tmp/append-example-20241103-094551/obs\n", + "Wrote /tmp/append-example-20241103-094551/ms/RNA/var\n", + "Writing /tmp/append-example-20241103-094551/ms/RNA/X/data\n", + "Wrote /tmp/append-example-20241103-094551/ms/RNA/X/data\n", + "Wrote /tmp/append-example-20241103-094551\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Create the initial SOMA Experiment" + "data": { + "text/plain": [ + "'/tmp/append-example-20241103-094551'" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Next we'll prep some input data. To make things easy for this self-contained demo, we'll use Scanpy's `pbmc3k`, with a custom column." - ] + "name": "stderr", + "output_type": "stream", + "text": [ + ":119: FutureWarning: SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset.\n", + "\n", + "For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.\n", + "\n", + "For creation, use `anndata.experimental.sparse_dataset(X)` instead.\n", + "\n" + ] }, { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ad1 = sc.datasets.pbmc3k()\n", - "sc.pp.calculate_qc_metrics(ad1, inplace=True)\n", - "ad1.obs[\"when\"] = [\"Monday\"] * len(ad1.obs)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Now we're ready to ingest the data into a SOMA experiment. Since SOMA is multimodal, we'll specify the destination modality, or measurement name, to be \"RNA\"." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145833\n" + ] }, { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "measurement_name = \"RNA\"" + "data": { + "text/plain": [ + "'/tmp/append-example-20240521-145833'" ] - }, + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tiledbsoma.logging.info()\n", + "tiledbsoma.io.from_anndata(experiment_uri, ad1, measurement_name=measurement_name)" + ] + }, + { + "cell_type": "markdown", + "id": "2c993ff5", + "metadata": { + "tags": [] + }, + "source": [ + "Now let's read back the data. We'll take a look at `obs`, `var`, and `X`." + ] + }, + { + "cell_type": "markdown", + "id": "40c6b6f0", + "metadata": { + "tags": [] + }, + "source": [ + "**obs**: For this initial ingest, there are obs IDs ending in `-1`, the `when` is `Monday`, and there are 2700 rows. Also note that since TileDB is a columnar database, when we select certain columns, those are the only ones loaded from disk. This positively impacts performance at cloud scale." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d6ca5c9e", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: registering isolated AnnData object.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833/obs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833/ms/RNA/var\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing /tmp/append-example-20240521-145833/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":119: FutureWarning: SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset.\n", - "\n", - "For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.\n", - "\n", - "For creation, use `anndata.experimental.sparse_dataset(X)` instead.\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833\n" - ] - }, - { - "data": { - "text/plain": [ - "'/tmp/append-example-20240521-145833'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tiledbsoma.logging.info()\n", - "tiledbsoma.io.from_anndata(experiment_uri, ad1, measurement_name=measurement_name)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + " obs_id n_genes_by_counts when\n", + "0 AAACATACAACCAC-1 781 Monday\n", + "1 AAACATTGAGCTAC-1 1352 Monday\n", + "2 AAACATTGATCAGC-1 1131 Monday\n", + "3 
AAACCGTGCTTCCG-1 960 Monday\n", + "4 AAACCGTGTATGCG-1 522 Monday\n", + "... ... ... ...\n", + "2695 TTTCGAACTCTCAT-1 1155 Monday\n", + "2696 TTTCTACTGAGGCA-1 1227 Monday\n", + "2697 TTTCTACTTCCTCG-1 622 Monday\n", + "2698 TTTGCATGAGAGGC-1 454 Monday\n", + "2699 TTTGCATGCCTCAC-1 724 Monday\n", + "\n", + "[2700 rows x 3 columns]\n" + ] + } + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " print(\n", + " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n", + " .concat()\n", + " .to_pandas()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "b8610b39", + "metadata": { + "tags": [] + }, + "source": [ + "**var**: Let's also look at `var`, selecting out the join IDs (which index columns of `X`) as well as the Ensembl-format and NCBI-format gene IDs:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "221c472f", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Now let's read back the data. We'll take a look at `obs`, `var`, and `X`." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + " soma_joinid var_id gene_ids\n", + "0 0 MIR1302-10 ENSG00000243485\n", + "1 1 FAM138A ENSG00000237613\n", + "2 2 OR4F5 ENSG00000186092\n", + "3 3 RP11-34P13.7 ENSG00000238009\n", + "4 4 RP11-34P13.8 ENSG00000239945\n", + "... ... ... 
...\n", + "32733 32733 AC145205.1 ENSG00000215635\n", + "32734 32734 BAGE5 ENSG00000268590\n", + "32735 32735 CU459201.1 ENSG00000251180\n", + "32736 32736 AC002321.2 ENSG00000215616\n", + "32737 32737 AC002321.1 ENSG00000215611\n", + "\n", + "[32738 rows x 3 columns]\n" + ] + } + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " print(\n", + " exp.ms[\"RNA\"]\n", + " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n", + " .concat()\n", + " .to_pandas()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "2e74cd29", + "metadata": { + "tags": [] + }, + "source": [ + "**X**: Lastly let's look at the expression matrix, in COO format. (You can convert to other formats if you like.) Its rows and columns are indexed by the `soma_joinid` of the `obs` and `var` dataframes, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69ba087b-cb5f-4851-8ee6-3a4d828d70a6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " X = exp.ms[\"RNA\"].X[\"data\"]\n", + " print(X.read().tables().concat().to_pandas())\n", + " print()\n", + " print(X.used_shape())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bc5b5740-79a4-4831-8549-f06ac079fb02", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "**obs**: For this initial ingest, there are obs IDs ending in `-1`, the `when` is `Monday`, and there are 2700 rows. Also note that since TileDB is a columnar database, when we select certain columns, those are the only ones loaded from disk. This positively impacts performance at cloud scale." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[DataFrame] obs \n", + " URI file:///tmp/append-example-20241103-094551/obs\n", + " count 2700\n", + " domain ((0, 2699),)\n", + " maxdomain ((0, 9223372036854773758),)\n", + " upgraded True\n", + "\n", + "[DataFrame] ms/RNA/var \n", + " URI file:///tmp/append-example-20241103-094551/ms/RNA/var\n", + " count 32738\n", + " domain ((0, 32737),)\n", + " maxdomain ((0, 9223372036854773758),)\n", + " upgraded True\n", + "\n", + "[SparseNDArray] ms/RNA/X/data \n", + " URI file:///tmp/append-example-20241103-094551/ms/RNA/X/data\n", + " used_shape ((0, 2699), (0, 32732))\n", + " shape (2700, 32738)\n", + " maxshape (9223372036854773759, 9223372036854773759)\n", + " upgraded True\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id n_genes_by_counts when\n", - "0 AAACATACAACCAC-1 781 Monday\n", - "1 AAACATTGAGCTAC-1 1352 Monday\n", - "2 AAACATTGATCAGC-1 1131 Monday\n", - "3 AAACCGTGCTTCCG-1 960 Monday\n", - "4 AAACCGTGTATGCG-1 522 Monday\n", - "... ... ... 
...\n", - "2695 TTTCGAACTCTCAT-1 1155 Monday\n", - "2696 TTTCTACTGAGGCA-1 1227 Monday\n", - "2697 TTTCTACTTCCTCG-1 622 Monday\n", - "2698 TTTGCATGAGAGGC-1 454 Monday\n", - "2699 TTTGCATGCCTCAC-1 724 Monday\n", - "\n", - "[2700 rows x 3 columns]\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " print(\n", - " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n", - " .concat()\n", - " .to_pandas()\n", - " )" + "data": { + "text/plain": [ + "True" ] - }, + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tiledbsoma.io.show_experiment_shapes(exp.uri)" + ] + }, + { + "cell_type": "markdown", + "id": "cd08018e", + "metadata": { + "tags": [] + }, + "source": [ + "## Appending a new dataset to the SOMA Experiment" + ] + }, + { + "cell_type": "markdown", + "id": "10f03631", + "metadata": { + "tags": [] + }, + "source": [ + "Now, let's simulate another day's sequencing run. For simplicity of this demo notebook, we'll mutate the previous dataset, changing the obs IDs to have a `-2` suffix, and also putting `Tuesday` in the `when` column. Also, we'll multiply the `X` values by 10." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "81ca4031", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ad2 = ad1.copy()\n", + "ad2.obs.index = [e.replace(\"-1\", \"-2\") for e in ad1.obs.index]\n", + "ad2.obs[\"when\"] = [\"Tuesday\"] * len(ad2.obs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d703ebb7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ad2.X *= 10" + ] + }, + { + "cell_type": "markdown", + "id": "90e85660", + "metadata": { + "tags": [] + }, + "source": [ + "Now we simply ingest as before -- the only additional step is a black-box registration step which detects which cell IDs are new (here, all of them) and which gene IDs are new (here, none of them).\n", + "\n", + "The registration takes two forms, either of which you can use depending on your use-case: `tiledbsoma.io.register_anndatas` for in-memory AnnData objects, or `tiledbsoma.io.register_h5ads` for on-storage AnnData objects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd0cf6c9-cd3c-4faf-84f5-8659cac7c4f5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# XXX TEMP\n", + "tiledbsoma.io.resize_experiment(exp.uri, nobs=rd.get_obs_shape(), nvars=rd.get_var_shapes())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f183088d-b428-47ce-99f6-c12157867357", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "**var**: Let's also look at `var`, selecting out the join IDs (which index columns of `X`) as well as the Ensembl-format and NCBI-format gene IDs:" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[DataFrame] obs \n", + " URI file:///tmp/append-example-20241103-094551/obs\n", + " count 5400\n", + " domain ((0, 5399),)\n", + " maxdomain ((0, 9223372036854773758),)\n", + " upgraded True\n", + "\n", + "[DataFrame] ms/RNA/var \n", + " URI file:///tmp/append-example-20241103-094551/ms/RNA/var\n", + " count 32738\n", + " domain ((0, 32737),)\n", + " maxdomain ((0, 9223372036854773758),)\n", + " upgraded True\n", + "\n", + "[SparseNDArray] ms/RNA/X/data \n", + " URI file:///tmp/append-example-20241103-094551/ms/RNA/X/data\n", + " used_shape ((0, 5399), (0, 32732))\n", + " shape (5400, 32738)\n", + " maxshape (9223372036854773759, 9223372036854773759)\n", + " upgraded True\n" + ] }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " soma_joinid var_id gene_ids\n", - "0 0 MIR1302-10 ENSG00000243485\n", - "1 1 FAM138A ENSG00000237613\n", - "2 2 OR4F5 ENSG00000186092\n", - "3 3 RP11-34P13.7 ENSG00000238009\n", - "4 4 RP11-34P13.8 ENSG00000239945\n", - "... ... ... 
...\n", - "32733 32733 AC145205.1 ENSG00000215635\n", - "32734 32734 BAGE5 ENSG00000268590\n", - "32735 32735 CU459201.1 ENSG00000251180\n", - "32736 32736 AC002321.2 ENSG00000215616\n", - "32737 32737 AC002321.1 ENSG00000215611\n", - "\n", - "[32738 rows x 3 columns]\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " print(\n", - " exp.ms[\"RNA\"]\n", - " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n", - " .concat()\n", - " .to_pandas()\n", - " )" + "data": { + "text/plain": [ + "True" ] - }, + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# XXX TEMP\n", + "tiledbsoma.io.show_experiment_shapes(exp.uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "2ee7c0e4-eb36-482e-a37e-7c786607b76a", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "**X**: Lastly let's look at the expression matrix, in COO format. (You can convert to other formats if you like.) Its rows and columns are indexed by the `soma_joinid` of the `obs` and `var` dataframes, respectively." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Registration: starting with experiment /tmp/append-example-20241103-094551\n", + "Registration: found nobs=2700 nvar=32738 from experiment.\n", + "Registration: registering AnnData object.\n", + "Registration: accumulated to nobs=5400 nvar=32738.\n", + "Registration: complete.\n", + "Wrote /tmp/append-example-20241103-094551/obs\n", + "Wrote /tmp/append-example-20241103-094551/ms/RNA/var\n", + "Writing /tmp/append-example-20241103-094551/ms/RNA/X/data\n", + "Wrote /tmp/append-example-20241103-094551/ms/RNA/X/data\n", + "Wrote file:///tmp/append-example-20241103-094551\n" + ] }, { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " soma_dim_0 soma_dim_1 soma_data\n", - "0 0 70 1.0\n", - "1 0 166 1.0\n", - "2 0 178 2.0\n", - "3 0 326 1.0\n", - "4 0 363 1.0\n", - "... ... ... ...\n", - "2286879 2699 32697 1.0\n", - "2286880 2699 32698 7.0\n", - "2286881 2699 32702 1.0\n", - "2286882 2699 32705 1.0\n", - "2286883 2699 32708 3.0\n", - "\n", - "[2286884 rows x 3 columns]\n", - "\n", - "((0, 2699), (0, 32732))\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " X = exp.ms[\"RNA\"].X[\"data\"]\n", - " print(X.read().tables().concat().to_pandas())\n", - " print()\n", - " print(X.used_shape())" + "data": { + "text/plain": [ + "'file:///tmp/append-example-20241103-094551'" ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Appending a new dataset to the SOMA Experiment" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Now, let's simiulate another day's sequencing run. 
For simplicity of this demo notebook, we'll mutate the previous dataset, changing the obs IDs to have a `-2` suffix, and also putting `Tuesday` in the `when` column. Also, we'll multiply the `X` values by 10." - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145833\n" + ] }, { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ad2 = ad1.copy()\n", - "ad2.obs.index = [e.replace(\"-1\", \"-2\") for e in ad1.obs.index]\n", - "ad2.obs[\"when\"] = [\"Tuesday\"] * len(ad2.obs)" + "data": { + "text/plain": [ + "'/tmp/append-example-20240521-145833'" ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ad2.X *= 10" - ] - }, + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rd = tiledbsoma.io.register_anndatas(\n", + " experiment_uri,\n", + " [ad2],\n", + " measurement_name=measurement_name,\n", + " obs_field_name=\"obs_id\",\n", + " var_field_name=\"var_id\",\n", + ")\n", + "\n", + "tiledbsoma.io.from_anndata(\n", + " experiment_uri,\n", + " ad2,\n", + " measurement_name=measurement_name,\n", + " registration_mapping=rd,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "53d07733", + "metadata": { + "tags": [] + }, + "source": [ + "Now let's read back the appended data. 
There are now obs IDs ending in `-1` as well as `-2`, the `when` includes `Monday` as well as `Tuesday`, and there are 5400 rows.\n", + "\n", + "(For `Wednesday` and onward, it'll simply be the same pattern -- we can grow our data iteratively over time, to arbitrary sizes.)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a7b2aebe", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Now we simply ingest as before -- the only additional step is a black-box registration step which detects which cell IDs are new (here, all of them) and which gene IDs are new (here, none of them).\n", - "\n", - "The registration takes two forms, either of which you can use depending on your use-case: `tiledbsoma.io.register_anndatas` for in-memory AnnData objects, or `tiledbsoma.io.register_h5ads` for on-storage AnnData objects." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + " obs_id n_genes_by_counts when\n", + "0 AAACATACAACCAC-1 781 Monday\n", + "1 AAACATTGAGCTAC-1 1352 Monday\n", + "2 AAACATTGATCAGC-1 1131 Monday\n", + "3 AAACCGTGCTTCCG-1 960 Monday\n", + "4 AAACCGTGTATGCG-1 522 Monday\n", + "... ... ... ...\n", + "5395 TTTCGAACTCTCAT-2 1155 Tuesday\n", + "5396 TTTCTACTGAGGCA-2 1227 Tuesday\n", + "5397 TTTCTACTTCCTCG-2 622 Tuesday\n", + "5398 TTTGCATGAGAGGC-2 454 Tuesday\n", + "5399 TTTGCATGCCTCAC-2 724 Tuesday\n", + "\n", + "[5400 rows x 3 columns]\n" + ] + } + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " print(\n", + " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n", + " .concat()\n", + " .to_pandas()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "c8bc7cd2", + "metadata": { + "tags": [] + }, + "source": [ + "Let's also look at `var`, as before. Since we had data for more cells but for the same genes, there is nothing new here. 
The `obs` table grew downward with the new cells, and `X` grew downward with new rows, but `var` stayed the same.\n", + "\n", + "In real-world data, occasionally you will see a gene expressed in subsequent data which wasn't expressed in the initial data. That's fine -- you'll simply see `var` grow just a bit for those newly encountered gene IDs, with corresponding new columns for `X`." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4a1cc20e", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: starting with experiment /tmp/append-example-20240521-145833\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: found nobs=2700 nvar=32738 from experiment.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: registering AnnData object.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: accumulated to nobs=5400 nvar=32738.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: complete.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833/obs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833/ms/RNA/var\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing /tmp/append-example-20240521-145833/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145833\n" - ] - }, - { - "data": { - "text/plain": [ - "'/tmp/append-example-20240521-145833'" - ] - }, - 
"execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rd = tiledbsoma.io.register_anndatas(\n", - " experiment_uri,\n", - " [ad2],\n", - " measurement_name=measurement_name,\n", - " obs_field_name=\"obs_id\",\n", - " var_field_name=\"var_id\",\n", - ")\n", - "\n", - "tiledbsoma.io.from_anndata(\n", - " experiment_uri,\n", - " ad2,\n", - " measurement_name=measurement_name,\n", - " registration_mapping=rd,\n", - ")" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + " soma_joinid var_id gene_ids\n", + "0 0 MIR1302-10 ENSG00000243485\n", + "1 1 FAM138A ENSG00000237613\n", + "2 2 OR4F5 ENSG00000186092\n", + "3 3 RP11-34P13.7 ENSG00000238009\n", + "4 4 RP11-34P13.8 ENSG00000239945\n", + "... ... ... ...\n", + "32733 32733 AC145205.1 ENSG00000215635\n", + "32734 32734 BAGE5 ENSG00000268590\n", + "32735 32735 CU459201.1 ENSG00000251180\n", + "32736 32736 AC002321.2 ENSG00000215616\n", + "32737 32737 AC002321.1 ENSG00000215611\n", + "\n", + "[32738 rows x 3 columns]\n" + ] + } + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " print(\n", + " exp.ms[\"RNA\"]\n", + " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n", + " .concat()\n", + " .to_pandas()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "499785d6", + "metadata": { + "tags": [] + }, + "source": [ + "And lastly, the `X` expression matrix which has grown downward with the new cells, while keeping the same width as we didn't introduce new genes:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "d640bde0", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Now let's read back the appended data. 
There are now obs IDs ending in `-1` as well as `-2`, the `when` includes `Monday` as well as `Tuesday`, and there are 5400 rows.\n", - "\n", - "(For `Wednesday` and onward, it'll simply be the same pattern -- we can grow our data iteratively over time, to arbitrary sizes.)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + " soma_dim_0 soma_dim_1 soma_data\n", + "0 0 70 1.0\n", + "1 0 166 1.0\n", + "2 0 178 2.0\n", + "3 0 326 1.0\n", + "4 0 363 1.0\n", + "... ... ... ...\n", + "4573763 5399 32697 10.0\n", + "4573764 5399 32698 70.0\n", + "4573765 5399 32702 10.0\n", + "4573766 5399 32705 10.0\n", + "4573767 5399 32708 30.0\n", + "\n", + "[4573768 rows x 3 columns]\n", + "\n", + "((0, 5399), (0, 32732))\n" + ] + } + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " X = exp.ms[\"RNA\"].X[\"data\"]\n", + " print(X.read().tables().concat().to_pandas())\n", + " print()\n", + " print(X.used_shape())" + ] + }, + { + "cell_type": "markdown", + "id": "290da7f2", + "metadata": { + "tags": [] + }, + "source": [ + "## Ingesting multiple datasets to a SOMA Experiment" + ] + }, + { + "cell_type": "markdown", + "id": "5d812c64", + "metadata": { + "tags": [] + }, + "source": [ + "Finally, we'll demonstrate combining multiple AnnDatas into one new experiment.\n", + "\n", + "The flow is pretty similar to the above:\n", + "\n", + "1. One call to `register_anndatas` or `register_h5ads` (passing all input AnnDatas/h5ads)\n", + "2. One call to `from_anndata`/`from_h5ad` *for each input AnnData*\n", + "\n", + "Here's a helper function to simulate multiple lab runs. As above, where we used `pbmc3k` to simulate Monday and Tuesday data, here we use `pbmc3k` to simulate multiple AnnData objects." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c3c185fb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def make_ad(when, scale, obs_id_suffix):\n", + " ad = ad1.copy()\n", + " ad.obs.index = [e.replace(\"-1\", obs_id_suffix) for e in ad.obs.index]\n", + " ad.obs[\"when\"] = [when] * len(ad.obs)\n", + " ad.X *= scale\n", + " return ad\n", + "\n", + "ads = [\n", + " make_ad(when, scale, f\"-{idx + 3}\")\n", + " for idx, (when, scale)\n", + " in enumerate({\n", + " \"Wednesday\": 20,\n", + " \"Thursday\": 30,\n", + " \"Friday\": 40,\n", + " }.items())\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "7da62a10", + "metadata": { + "tags": [] + }, + "source": [ + "We'll ingest these AnnData objects, as before, but this time to a fresh/empty `/tmp` location:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ae2d62ae", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id n_genes_by_counts when\n", - "0 AAACATACAACCAC-1 781 Monday\n", - "1 AAACATTGAGCTAC-1 1352 Monday\n", - "2 AAACATTGATCAGC-1 1131 Monday\n", - "3 AAACCGTGCTTCCG-1 960 Monday\n", - "4 AAACCGTGTATGCG-1 522 Monday\n", - "... ... ... 
...\n", - "5395 TTTCGAACTCTCAT-2 1155 Tuesday\n", - "5396 TTTCTACTGAGGCA-2 1227 Tuesday\n", - "5397 TTTCTACTTCCTCG-2 622 Tuesday\n", - "5398 TTTGCATGAGAGGC-2 454 Tuesday\n", - "5399 TTTGCATGCCTCAC-2 724 Tuesday\n", - "\n", - "[5400 rows x 3 columns]\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " print(\n", - " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n", - " .concat()\n", - " .to_pandas()\n", - " )" + "data": { + "text/plain": [ + "'/tmp/append-example-20241103-095132'" ] - }, + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n", + "experiment_uri = f\"/tmp/append-example-{stamp}\"\n", + "experiment_uri" + ] + }, + { + "cell_type": "markdown", + "id": "b89fe0ee", + "metadata": { + "tags": [] + }, + "source": [ + "Here we'll register all the AnnData objects. Note that the SOMA Experiment doesn't exist yet, so we pass `experiment_uri=None` to signify that." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac21dd19-2fd5-41ec-98e5-2596e0795f0d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "rd2 = tiledbsoma.io.register_anndatas(\n", + " experiment_uri=None, # new Experiment, from scratch\n", + " adatas=ads,\n", + " measurement_name=measurement_name,\n", + " obs_field_name=\"obs_id\",\n", + " var_field_name=\"var_id\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d6cff24a-1de8-46e1-b502-e69a0e92dc92", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Let's also look at `var`, as before. Since we had data for more cells but for the same genes, there is nothing new here. 
The `obs` table grew downward with the new cells, and `X` grew downward with new rows, but `var` stayed the same.\n", - "\n", - "In real-world data, occasionally you will see a gene expressed in subsequent data which wasn't expressed in the initial data. That's fine -- you'll simply see `var` grow just a bit for those newly encountered gene IDs, with corresponding new columns for `X`." + "data": { + "text/plain": [ + "True" ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" }, { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " soma_joinid var_id gene_ids\n", - "0 0 MIR1302-10 ENSG00000243485\n", - "1 1 FAM138A ENSG00000237613\n", - "2 2 OR4F5 ENSG00000186092\n", - "3 3 RP11-34P13.7 ENSG00000238009\n", - "4 4 RP11-34P13.8 ENSG00000239945\n", - "... ... ... ...\n", - "32733 32733 AC145205.1 ENSG00000215635\n", - "32734 32734 BAGE5 ENSG00000268590\n", - "32735 32735 CU459201.1 ENSG00000251180\n", - "32736 32736 AC002321.2 ENSG00000215616\n", - "32737 32737 AC002321.1 ENSG00000215611\n", - "\n", - "[32738 rows x 3 columns]\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " print(\n", - " exp.ms[\"RNA\"]\n", - " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n", - " .concat()\n", - " .to_pandas()\n", - " )" - ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "And lastly, the `X` expression matrix which has grown downward with the new cells, while keeping the same width as we didn't introduce new genes:" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Registration: registering AnnData object.\n" + ] }, { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "tags": [] - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " soma_dim_0 soma_dim_1 soma_data\n", - "0 0 70 1.0\n", - "1 0 166 1.0\n", - "2 0 178 2.0\n", - "3 0 326 1.0\n", - "4 0 363 1.0\n", - "... ... ... ...\n", - "4573763 5399 32697 10.0\n", - "4573764 5399 32698 70.0\n", - "4573765 5399 32702 10.0\n", - "4573766 5399 32705 10.0\n", - "4573767 5399 32708 30.0\n", - "\n", - "[4573768 rows x 3 columns]\n", - "\n", - "((0, 5399), (0, 32732))\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " X = exp.ms[\"RNA\"].X[\"data\"]\n", - " print(X.read().tables().concat().to_pandas())\n", - " print()\n", - " print(X.used_shape())" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "Registration: complete.\n" + ] + } + ], + "source": [ + "# XXX TEMP\n", + "tiledbsoma.io.resize_experiment(exp.uri, nobs=rd2.get_obs_shape(), nvars=rd2.get_var_shapes())" + ] + }, + { + "cell_type": "markdown", + "id": "77429cf0", + "metadata": { + "tags": [] + }, + "source": [ + "Now that we've gotten the registrations for all the input AnnData objects, we can ingest them.\n", + "\n", + "Note:\n", + "\n", + "- Here we ingest them sequentially, in the same order as above.\n", + "- But we could also ingest them in any shuffled order.\n", + "- Or we could have multiple workers ingest them in parallel, one worker per AnnData object, as long as the registration data are passed to each worker." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "27ed22b2", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "## Ingesting multiple datasets to a SOMA Experiment" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20241103-095132/obs\n", + "Wrote /tmp/append-example-20241103-095132/ms/RNA/var\n", + "Writing /tmp/append-example-20241103-095132/ms/RNA/X/data\n", + "Wrote /tmp/append-example-20241103-095132/ms/RNA/X/data\n", + "Wrote file:///tmp/append-example-20241103-095132\n", + "Wrote /tmp/append-example-20241103-095132/obs\n", + "Wrote /tmp/append-example-20241103-095132/ms/RNA/var\n", + "Writing /tmp/append-example-20241103-095132/ms/RNA/X/data\n", + "Wrote /tmp/append-example-20241103-095132/ms/RNA/X/data\n", + "Wrote file:///tmp/append-example-20241103-095132\n", + "Wrote /tmp/append-example-20241103-095132/obs\n", + "Wrote /tmp/append-example-20241103-095132/ms/RNA/var\n", + "Writing /tmp/append-example-20241103-095132/ms/RNA/X/data\n", + "Wrote /tmp/append-example-20241103-095132/ms/RNA/X/data\n", + "Wrote file:///tmp/append-example-20241103-095132\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Finally, we'll demonstrate combining multiple AnnDatas into one new experiment.\n", - "\n", - "The flow is pretty similar to the above:\n", - "\n", - "1. One call to `register_anndatas` or `register_h5ads` (passing all input AnnDatas/h5ads)\n", - "2. One call to `from_anndata`/`from_h5ad` *for each input AnnData*\n", - "\n", - "Here's a helper function to simulate multiple lab runs. As above, where we used `pbmc3k` to simulate Monday and Tuesday data, here we use `pbmc3k` to simulate multiple AnnData objects." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n" + ] }, { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def make_ad(when, scale, obs_id_suffix):\n", - " ad = ad1.copy()\n", - " ad.obs.index = [e.replace(\"-1\", obs_id_suffix) for e in ad.obs.index]\n", - " ad.obs[\"when\"] = [when] * len(ad.obs)\n", - " ad.X *= scale\n", - " return ad\n", - "\n", - "ads = [\n", - " make_ad(when, scale, f\"-{idx + 3}\")\n", - " for idx, (when, scale)\n", - " in enumerate({\n", - " \"Wednesday\": 20,\n", - " \"Thursday\": 30,\n", - " \"Friday\": 40,\n", - " }.items())\n", - "]" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "We'll ingest these AnnData objects, as before, but this time to a fresh/empty `/tmp` location:" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839/obs\n" + ] }, { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'/tmp/append-example-20240521-145839'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n", - "experiment_uri = f\"/tmp/append-example-{stamp}\"\n", - "experiment_uri" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Here we'll register all the AnnData objects. Note that the SOMA Experiment doesn't exist yet, so we pass `experiment_uri=None` to signify that." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n" + ] }, { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: registering AnnData object.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: accumulated to nobs=2700 nvar=32738.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: registering AnnData object.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: accumulated to nobs=5400 nvar=32738.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: registering AnnData object.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: accumulated to nobs=8100 nvar=32738.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registration: complete.\n" - ] - } - ], - "source": [ - "rd2 = tiledbsoma.io.register_anndatas(\n", - " experiment_uri=None, # new Experiment, from scratch\n", - " adatas=ads,\n", - " measurement_name=measurement_name,\n", - " obs_field_name=\"obs_id\",\n", - " var_field_name=\"var_id\",\n", - ")" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Now that we've gotten the registrations for all the input AnnData objects, we can ingest them.\n", - "\n", - "Note:\n", - "\n", - "- Here we ingest them sequentially, in the same order as above.\n", - "- But we could also ingest them in any shuffled order.\n", - "- Or we could have multiple workers in ingest them in parallel, one worker per AnnData object, as long as the registration data are passed to each worker." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839\n" + ] }, { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/obs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/obs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/obs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote 
/tmp/append-example-20240521-145839/ms/RNA/X/data\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Wrote /tmp/append-example-20240521-145839\n" - ] - } - ], - "source": [ - "for ad in ads:\n", - " tiledbsoma.io.from_anndata(\n", - " experiment_uri,\n", - " ad,\n", - " measurement_name=measurement_name,\n", - " registration_mapping=rd2,\n", - " )" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839/obs\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Reading back the concatenated data, we see 2700 rows for each of {`-3`, `-4`, `-5`}:" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n" + ] }, { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " obs_id n_genes_by_counts when\n", - "0 AAACATACAACCAC-3 781 Wednesday\n", - "1 AAACATTGAGCTAC-3 1352 Wednesday\n", - "2 AAACATTGATCAGC-3 1131 Wednesday\n", - "3 AAACCGTGCTTCCG-3 960 Wednesday\n", - "4 AAACCGTGTATGCG-3 522 Wednesday\n", - "... ... ... 
...\n", - "8095 TTTCGAACTCTCAT-5 1155 Friday\n", - "8096 TTTCTACTGAGGCA-5 1227 Friday\n", - "8097 TTTCTACTTCCTCG-5 622 Friday\n", - "8098 TTTGCATGAGAGGC-5 454 Friday\n", - "8099 TTTGCATGCCTCAC-5 724 Friday\n", - "\n", - "[8100 rows x 3 columns]\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " print(\n", - " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n", - " .concat()\n", - " .to_pandas()\n", - " )" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "`var` is the same as in the single original Anndata objects (since we added more cells with all the same genes):" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n" + ] }, { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " soma_joinid var_id gene_ids\n", - "0 0 MIR1302-10 ENSG00000243485\n", - "1 1 FAM138A ENSG00000237613\n", - "2 2 OR4F5 ENSG00000186092\n", - "3 3 RP11-34P13.7 ENSG00000238009\n", - "4 4 RP11-34P13.8 ENSG00000239945\n", - "... ... ... 
...\n", - "32733 32733 AC145205.1 ENSG00000215635\n", - "32734 32734 BAGE5 ENSG00000268590\n", - "32735 32735 CU459201.1 ENSG00000251180\n", - "32736 32736 AC002321.2 ENSG00000215616\n", - "32737 32737 AC002321.1 ENSG00000215611\n", - "\n", - "[32738 rows x 3 columns]\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " print(\n", - " exp.ms[\"RNA\"]\n", - " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n", - " .concat()\n", - " .to_pandas()\n", - " )" - ] - }, + "name": "stderr", + "output_type": "stream", + "text": [ + "Wrote /tmp/append-example-20240521-145839\n" + ] + } + ], + "source": [ + "for ad in ads:\n", + " tiledbsoma.io.from_anndata(\n", + " experiment_uri,\n", + " ad,\n", + " measurement_name=measurement_name,\n", + " registration_mapping=rd2,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "e2e54f89", + "metadata": { + "tags": [] + }, + "source": [ + "Reading back the concatenated data, we see 2700 rows for each of {`-3`, `-4`, `-5`}:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "8f86fd3d", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Finally, the `X` expression matrix contains 3x the entries as the original, but is also the same width (since we didn't introduce new genes):" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + " obs_id n_genes_by_counts when\n", + "0 AAACATACAACCAC-3 781 Wednesday\n", + "1 AAACATTGAGCTAC-3 1352 Wednesday\n", + "2 AAACATTGATCAGC-3 1131 Wednesday\n", + "3 AAACCGTGCTTCCG-3 960 Wednesday\n", + "4 AAACCGTGTATGCG-3 522 Wednesday\n", + "... ... ... 
...\n", + "2695 TTTCGAACTCTCAT-3 1155 Wednesday\n", + "2696 TTTCTACTGAGGCA-3 1227 Wednesday\n", + "2697 TTTCTACTTCCTCG-3 622 Wednesday\n", + "2698 TTTGCATGAGAGGC-3 454 Wednesday\n", + "2699 TTTGCATGCCTCAC-3 724 Wednesday\n", + "\n", + "[2700 rows x 3 columns]\n" + ] + } + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " print(\n", + " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n", + " .concat()\n", + " .to_pandas()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "4f8596b0", + "metadata": { + "tags": [] + }, + "source": [ + "`var` is the same as in the single original Anndata objects (since we added more cells with all the same genes):" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "bffce533", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " soma_dim_0 soma_dim_1 soma_data\n", - "0 0 70 20.0\n", - "1 0 166 20.0\n", - "2 0 178 40.0\n", - "3 0 326 20.0\n", - "4 0 363 20.0\n", - "... ... ... ...\n", - "6860647 8099 32697 40.0\n", - "6860648 8099 32698 280.0\n", - "6860649 8099 32702 40.0\n", - "6860650 8099 32705 40.0\n", - "6860651 8099 32708 120.0\n", - "\n", - "[6860652 rows x 3 columns]\n", - "\n", - "((0, 8099), (0, 32732))\n" - ] - } - ], - "source": [ - "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", - " X = exp.ms[\"RNA\"].X[\"data\"]\n", - " print(X.read().tables().concat().to_pandas())\n", - " print()\n", - " print(X.used_shape())" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + " soma_joinid var_id gene_ids\n", + "0 0 MIR1302-10 ENSG00000243485\n", + "1 1 FAM138A ENSG00000237613\n", + "2 2 OR4F5 ENSG00000186092\n", + "3 3 RP11-34P13.7 ENSG00000238009\n", + "4 4 RP11-34P13.8 ENSG00000239945\n", + "... ... ... 
...\n", + "32733 32733 AC145205.1 ENSG00000215635\n", + "32734 32734 BAGE5 ENSG00000268590\n", + "32735 32735 CU459201.1 ENSG00000251180\n", + "32736 32736 AC002321.2 ENSG00000215616\n", + "32737 32737 AC002321.1 ENSG00000215611\n", + "\n", + "[32738 rows x 3 columns]\n" + ] } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.8" + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " print(\n", + " exp.ms[\"RNA\"]\n", + " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n", + " .concat()\n", + " .to_pandas()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "9a9737a0", + "metadata": { + "tags": [] + }, + "source": [ + "Finally, the `X` expression matrix contains 3x the entries as the original, but is also the same width (since we didn't introduce new genes):" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "05cf63a0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " soma_dim_0 soma_dim_1 soma_data\n", + "0 0 70 20.0\n", + "1 0 166 20.0\n", + "2 0 178 40.0\n", + "3 0 326 20.0\n", + "4 0 363 20.0\n", + "... ... ... 
...\n", + "6860647 8099 32697 40.0\n", + "6860648 8099 32698 280.0\n", + "6860649 8099 32702 40.0\n", + "6860650 8099 32705 40.0\n", + "6860651 8099 32708 120.0\n", + "\n", + "[6860652 rows x 3 columns]\n", + "\n", + "((0, 8099), (0, 32732))\n" + ] } + ], + "source": [ + "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n", + " X = exp.ms[\"RNA\"].X[\"data\"]\n", + " print(X.read().tables().concat().to_pandas())\n", + " print()\n", + " print(X.used_shape())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99e89175-335b-4ef8-95ec-290a702ae10d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 5 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/apis/python/notebooks/tutorial_soma_objects.ipynb b/apis/python/notebooks/tutorial_soma_objects.ipynb index 9071d7c677..a4bb0215b3 100644 --- a/apis/python/notebooks/tutorial_soma_objects.ipynb +++ b/apis/python/notebooks/tutorial_soma_objects.ipynb @@ -52,19 +52,19 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "c3af1793-e2be-45e1-8128-bb64536673f7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -86,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "228e9411-434e-4c55-8fb4-fef3216dca08", "metadata": { "tags": [] @@ -95,12 +95,12 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 6, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } 
@@ -111,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "5d92e331-5c6c-4971-b956-442996d5efa9", "metadata": { "tags": [] @@ -120,10 +120,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -178,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "66b67624-3cbe-4401-a297-e008cf18ab0b", "metadata": { "tags": [] @@ -187,15 +187,15 @@ { "data": { "text/plain": [ - "soma_joinid: int64\n", - "obs_id: large_string\n", - "n_genes: int64\n", - "percent_mito: float\n", - "n_counts: float\n", - "louvain: large_string" + "soma_joinid: int64 not null\n", + "obs_id: large_string not null\n", + "n_genes: int64 not null\n", + "percent_mito: float not null\n", + "n_counts: float not null\n", + "louvain: large_string not null" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -231,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "id": "26676c4f-dfb8-4f48-9bc5-1a66ee085f9e", "metadata": { "tags": [] @@ -401,7 +401,7 @@ "[2638 rows x 6 columns]" ] }, - "execution_count": 9, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -432,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "id": "32bfed6c-b0b7-41ed-986c-df7d462498c4", "metadata": { "tags": [] @@ -599,7 +599,7 @@ "10 B cells " ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -620,7 +620,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "id": "703fe8ad-7123-4311-a58b-b00a27c7a483", "metadata": { "tags": [] @@ -726,7 +726,7 @@ "10 AAACTTGAAAAACG-1 1116" ] }, - "execution_count": 11, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -747,7 +747,7 @@ }, { "cell_type": "code", - 
"execution_count": 12, + "execution_count": 9, "id": "a5ef3a97-abc3-4d80-ab48-1898fa64d566", "metadata": { "tags": [] @@ -917,7 +917,7 @@ "[75 rows x 6 columns]" ] }, - "execution_count": 12, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -948,7 +948,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "d437b606-8338-4220-966d-59c4bf48fd13", "metadata": { "tags": [] @@ -957,12 +957,12 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -983,7 +983,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "0574abf8-5f72-4a05-a90f-608fdda2db07", "metadata": { "tags": [] @@ -992,12 +992,12 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1038,7 +1038,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 12, "id": "c8c2aa17-52d7-4bd5-a5f3-b58c18fdcb11", "metadata": { "tags": [] @@ -1047,11 +1047,11 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 23, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1071,7 +1071,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 13, "id": "75035918-b26f-48b2-a47b-8ea08c308e37", "metadata": { "tags": [] @@ -1080,10 +1080,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 26, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1105,7 +1105,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 14, "id": "b12b2c0e-db32-48f1-b114-f867baf5be76", "metadata": { "tags": [] @@ -1114,12 +1114,12 @@ { "data": { "text/plain": [ - "soma_dim_0: int64\n", - "soma_dim_1: int64\n", - "soma_data: float" + "soma_dim_0: int64 not null\n", + "soma_dim_1: int64 not null\n", + "soma_data: float not null" ] }, - 
"execution_count": 28, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1148,7 +1148,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 15, "id": "27858aed-aa89-4c45-bccc-38e9cfa5cbb2", "metadata": { "tags": [] @@ -1160,7 +1160,7 @@ "(2638, 1838)" ] }, - "execution_count": 34, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1183,7 +1183,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 16, "id": "2c586c5c-055b-4bc7-9995-851dd802d961", "metadata": { "tags": [] @@ -1198,7 +1198,7 @@ "strides: (7352, 4)" ] }, - "execution_count": 35, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1219,7 +1219,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 17, "id": "6d90e592-7a67-4a41-af08-05aa3807167a", "metadata": { "tags": [] @@ -1243,7 +1243,7 @@ " -0.13032717, -0.4713379 ]], dtype=float32)" ] }, - "execution_count": 38, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1266,7 +1266,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 18, "id": "4a3b1f45-017d-4b92-9f2e-c88e8e3aa234", "metadata": { "tags": [] @@ -1290,7 +1290,7 @@ " -0.16255915, -0.50339466]], dtype=float32)" ] }, - "execution_count": 48, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1302,7 +1302,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 19, "id": "007f1e15-61cd-40b8-bb23-4102662ab3af", "metadata": { "tags": [] @@ -1314,7 +1314,7 @@ "(10, 1838)" ] }, - "execution_count": 49, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1335,7 +1335,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 20, "id": "d6d39b44-33b3-4cb7-8a34-d30b94899ad1", "metadata": { "tags": [] @@ -1353,7 +1353,7 @@ " [-0.10383061]], dtype=float32)" ] }, - "execution_count": 104, + "execution_count": 
20, "metadata": {}, "output_type": "execute_result" } @@ -1407,7 +1407,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "id": "71f8fed4-4ffd-4f30-a5b0-4e3a4a3730f3", "metadata": { "tags": [] @@ -1416,10 +1416,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1440,7 +1440,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "id": "41897a5c-2225-49f9-b9f2-3a68a6ad8079", "metadata": { "tags": [] @@ -1449,12 +1449,12 @@ { "data": { "text/plain": [ - "soma_dim_0: int64\n", - "soma_dim_1: int64\n", - "soma_data: float" + "soma_dim_0: int64 not null\n", + "soma_dim_1: int64 not null\n", + "soma_data: float not null" ] }, - "execution_count": 17, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1475,7 +1475,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 23, "id": "42c1d852-6492-4a5e-b1fe-bc9af3f83639", "metadata": { "tags": [] @@ -1484,10 +1484,10 @@ { "data": { "text/plain": [ - "(9223372036854773760, 9223372036854773760)" + "(2147483646, 2147483646)" ] }, - "execution_count": 43, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1518,7 +1518,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 24, "id": "2862f737-4f08-4886-9496-fe7771b4a581", "metadata": { "tags": [] @@ -1530,7 +1530,7 @@ "4848644" ] }, - "execution_count": 56, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1551,7 +1551,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 25, "id": "eaa0f9aa-8167-4f26-a52f-4d9636dde37b", "metadata": { "tags": [] @@ -1560,10 +1560,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 44, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1582,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 
60, + "execution_count": 26, "id": "00a7899f-2d28-4f07-b438-ab4d4d6bcfe5", "metadata": { "tags": [] @@ -1721,7 +1721,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index b89f25e666..9f18d7d8e9 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -22,7 +22,7 @@ from ._exception import SOMAError, map_exception_for_create from ._flags import DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN, NEW_SHAPE_FEATURE_FLAG_ENABLED from ._tdb_handles import DenseNDArrayWrapper -from ._types import OpenTimestamp, Slice +from ._types import OpenTimestamp, Slice, StatusAndReason from ._util import dense_indices_to_shape from .options._soma_tiledb_context import ( SOMATileDBContext, @@ -361,6 +361,22 @@ def resize(self, newshape: Sequence[Union[int, None]]) -> None: else: raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0") + def tiledbsoma_upgrade_shape( + self, newshape: Sequence[Union[int, None]], check_only: bool = False + ) -> StatusAndReason: + """Allows the array to have a resizeable shape as described in the TileDB-SOMA + 1.15 release notes. Raises an error if the new shape exceeds maxshape in + any dimension. Raises an error if the array already has a shape. 
+ """ + if NEW_SHAPE_FEATURE_FLAG_ENABLED and DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN: + if check_only: + return self._handle.tiledbsoma_can_upgrade_shape(newshape) + else: + self._handle.tiledbsoma_upgrade_shape(newshape) + return (True, "") + else: + raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0") + @classmethod def _dim_capacity_and_extent( cls, diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index 915cfb67a2..9e160c79b0 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -658,6 +658,24 @@ def tiledbsoma_can_resize( else: raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0") + def tiledbsoma_upgrade_shape(self, newshape: Sequence[Union[int, None]]) -> None: + """Wrapper-class internals""" + if DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN: + self._handle.tiledbsoma_upgrade_shape(newshape) + else: + raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0") + + def tiledbsoma_can_upgrade_shape( + self, newshape: Sequence[Union[int, None]] + ) -> StatusAndReason: + """Wrapper-class internals""" + if DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN: + return cast( + StatusAndReason, self._handle.tiledbsoma_can_upgrade_shape(newshape) + ) + else: + raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0") + class SparseNDArrayWrapper(SOMAArrayWrapper[clib.SOMASparseNDArray]): """Wrapper around a Pybind11 SparseNDArrayWrapper handle.""" diff --git a/apis/python/src/tiledbsoma/io/shaping.py b/apis/python/src/tiledbsoma/io/shaping.py index ddd5e30eac..de847f532f 100644 --- a/apis/python/src/tiledbsoma/io/shaping.py +++ b/apis/python/src/tiledbsoma/io/shaping.py @@ -33,6 +33,18 @@ class SizingArgs(TypedDict): output_handle: Printable +def _find_old_sparse_ndarray_bounds( + snda: tiledbsoma.SparseNDArray, +) -> Tuple[Tuple[int, int], ...]: + # New arrays (created by tiledbsoma 1.15 and above) will have the new shape. 
+ # Older will have used_shape ... + # ... except _really_ old won't even have that. + try: + return snda.used_shape() + except tiledbsoma.SOMAError: + return snda.non_empty_domain() + + def show_experiment_shapes( uri: str, *, @@ -257,7 +269,8 @@ def _leaf_visitor_show_shapes( elif isinstance(item, tiledbsoma.SparseNDArray): _print_leaf_node_banner("SparseNDArray", node_name, item.uri, args) - _bannerize(args, "used_shape", item.used_shape()) + ####_bannerize(args, "used_shape", item.used_shape()) + _bannerize(args, "used_shape", _find_old_sparse_ndarray_bounds(item)) _bannerize(args, "shape", item.shape) _bannerize(args, "maxshape", item.maxshape) _bannerize(args, "upgraded", item.tiledbsoma_has_upgraded_shape) @@ -306,7 +319,8 @@ def _leaf_visitor_upgrade( print(" Already upgraded", file=args["output_handle"]) elif isinstance(item, tiledbsoma.SparseNDArray): - used_shape = item.used_shape() + #### used_shape = item.used_shape() + used_shape = _find_old_sparse_ndarray_bounds(item) new_shape = tuple(e[1] + 1 for e in used_shape) _print_leaf_node_banner("SparseNDArray", node_name, item.uri, args) diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py index 4777a4d6b0..13362da3f1 100644 --- a/apis/python/tests/test_shape.py +++ b/apis/python/tests/test_shape.py @@ -228,6 +228,18 @@ def test_dense_nd_array_basics(tmp_path): else: assert dnda.shape == (100, 200) + if ( + tiledbsoma._flags.DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN + and tiledbsoma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED + ): + with tiledbsoma.DenseNDArray.open(uri) as dnda: + ok, msg = dnda.tiledbsoma_upgrade_shape((600, 700), check_only=True) + assert not ok + assert ( + msg + == "tiledbsoma_can_upgrade_shape: array already has a shape: please use resize" + ) + @pytest.mark.parametrize( "soma_joinid_domain",