diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646982378_1730646982378_3d585af40365b4e37b8f8ba3eb5d397f b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646982378_1730646982378_3d585af40365b4e37b8f8ba3eb5d397f
new file mode 100644
index 0000000000..a1ca32c564
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646982378_1730646982378_3d585af40365b4e37b8f8ba3eb5d397f differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646982386_1730646982386_30f8bc3e229e6b2a077df9af392f5b75 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646982386_1730646982386_30f8bc3e229e6b2a077df9af392f5b75
new file mode 100644
index 0000000000..4527feb154
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646982386_1730646982386_30f8bc3e229e6b2a077df9af392f5b75 differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646982394_1730646982394_14af812a1f2bc292bf6265be3d5c6a46 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646982394_1730646982394_14af812a1f2bc292bf6265be3d5c6a46
new file mode 100644
index 0000000000..a1ca32c564
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646982394_1730646982394_14af812a1f2bc292bf6265be3d5c6a46 differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646982400_1730646982400_6f65cb3d6c3f58482216217921b61181 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646982400_1730646982400_6f65cb3d6c3f58482216217921b61181
new file mode 100644
index 0000000000..a1ca32c564
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646982400_1730646982400_6f65cb3d6c3f58482216217921b61181 differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646982407_1730646982407_73bf9a9a85a4bab5e0243fb2bfe3ef04 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646982407_1730646982407_73bf9a9a85a4bab5e0243fb2bfe3ef04
new file mode 100644
index 0000000000..cc39a7493b
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646982407_1730646982407_73bf9a9a85a4bab5e0243fb2bfe3ef04 differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646982412_1730646982412_3be9315f3bf7a6b41fdc64385428b0c9 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646982412_1730646982412_3be9315f3bf7a6b41fdc64385428b0c9
new file mode 100644
index 0000000000..cc39a7493b
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646982412_1730646982412_3be9315f3bf7a6b41fdc64385428b0c9 differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/var/__schema/__1730645864313_1730645864313_7e3643f24b8a0bd62c5bc3826383b202 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/var/__schema/__1730645864313_1730645864313_7e3643f24b8a0bd62c5bc3826383b202
new file mode 100644
index 0000000000..35af3128b1
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/var/__schema/__1730645864313_1730645864313_7e3643f24b8a0bd62c5bc3826383b202 differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646982418_1730646982418_523922bbaad0be888311e0c354db3b53 b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646982418_1730646982418_523922bbaad0be888311e0c354db3b53
new file mode 100644
index 0000000000..3939bd284a
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646982418_1730646982418_523922bbaad0be888311e0c354db3b53 differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/raw/X/data/__schema/__1730646982429_1730646982429_40368fa8c7242d638bcbbb1bca05ec1f b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/X/data/__schema/__1730646982429_1730646982429_40368fa8c7242d638bcbbb1bca05ec1f
new file mode 100644
index 0000000000..cd0a921f44
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/X/data/__schema/__1730646982429_1730646982429_40368fa8c7242d638bcbbb1bca05ec1f differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/ms/raw/var/__schema/__1730646982424_1730646982424_326d997a982823e2faacaaf703a9eb1e b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/var/__schema/__1730646982424_1730646982424_326d997a982823e2faacaaf703a9eb1e
new file mode 100644
index 0000000000..aa919d163d
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/ms/raw/var/__schema/__1730646982424_1730646982424_326d997a982823e2faacaaf703a9eb1e differ
diff --git a/apis/python/notebooks/data/dense/pbmc3k/obs/__schema/__1730645864297_1730645864297_59235d8e66670248696b378ddf914c84 b/apis/python/notebooks/data/dense/pbmc3k/obs/__schema/__1730645864297_1730645864297_59235d8e66670248696b378ddf914c84
new file mode 100644
index 0000000000..e3350785db
Binary files /dev/null and b/apis/python/notebooks/data/dense/pbmc3k/obs/__schema/__1730645864297_1730645864297_59235d8e66670248696b378ddf914c84 differ
diff --git a/apis/python/notebooks/data/pbmc3k_raw.h5ad b/apis/python/notebooks/data/pbmc3k_raw.h5ad
new file mode 100644
index 0000000000..460ee0b60a
Binary files /dev/null and b/apis/python/notebooks/data/pbmc3k_raw.h5ad differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/X/data/__schema/__1730646967626_1730646967626_67119b59c3aa2e4b2f5285eb14924fb5 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/X/data/__schema/__1730646967626_1730646967626_67119b59c3aa2e4b2f5285eb14924fb5
new file mode 100644
index 0000000000..1fa7fd9632
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/X/data/__schema/__1730646967626_1730646967626_67119b59c3aa2e4b2f5285eb14924fb5 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646967637_1730646967637_3f0aa65f7ea0ca84eb931e62a3f066d2 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646967637_1730646967637_3f0aa65f7ea0ca84eb931e62a3f066d2
new file mode 100644
index 0000000000..a1ca32c564
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_draw_graph_fr/__schema/__1730646967637_1730646967637_3f0aa65f7ea0ca84eb931e62a3f066d2 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646967644_1730646967644_187ad8169f9d5226654947d0afc0cc55 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646967644_1730646967644_187ad8169f9d5226654947d0afc0cc55
new file mode 100644
index 0000000000..4527feb154
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_pca/__schema/__1730646967644_1730646967644_187ad8169f9d5226654947d0afc0cc55 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646967650_1730646967650_45a6de13169ab7508777260a4e4ded09 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646967650_1730646967650_45a6de13169ab7508777260a4e4ded09
new file mode 100644
index 0000000000..a1ca32c564
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_tsne/__schema/__1730646967650_1730646967650_45a6de13169ab7508777260a4e4ded09 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646967656_1730646967656_73cc8ae0b6ae7072842d72a4b2b33352 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646967656_1730646967656_73cc8ae0b6ae7072842d72a4b2b33352
new file mode 100644
index 0000000000..a1ca32c564
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsm/X_umap/__schema/__1730646967656_1730646967656_73cc8ae0b6ae7072842d72a4b2b33352 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646967663_1730646967663_3d149150f3b7c652f8f91820c0e5690b b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646967663_1730646967663_3d149150f3b7c652f8f91820c0e5690b
new file mode 100644
index 0000000000..cc39a7493b
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/connectivities/__schema/__1730646967663_1730646967663_3d149150f3b7c652f8f91820c0e5690b differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646967668_1730646967668_28f39375ad27b80e5c591ad3521afda1 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646967668_1730646967668_28f39375ad27b80e5c591ad3521afda1
new file mode 100644
index 0000000000..cc39a7493b
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/obsp/distances/__schema/__1730646967668_1730646967668_28f39375ad27b80e5c591ad3521afda1 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/var/__schema/__1730645851538_1730645851538_737b2e86bb24ab8b3d9ef856a43ec743 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/var/__schema/__1730645851538_1730645851538_737b2e86bb24ab8b3d9ef856a43ec743
new file mode 100644
index 0000000000..35af3128b1
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/var/__schema/__1730645851538_1730645851538_737b2e86bb24ab8b3d9ef856a43ec743 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646967674_1730646967674_14c5d26b3736039bce3a5b9632724554 b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646967674_1730646967674_14c5d26b3736039bce3a5b9632724554
new file mode 100644
index 0000000000..3939bd284a
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/RNA/varm/PCs/__schema/__1730646967674_1730646967674_14c5d26b3736039bce3a5b9632724554 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/X/data/__schema/__1730646967685_1730646967685_7cf688bc654e41169a4b4040d3518a83 b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/X/data/__schema/__1730646967685_1730646967685_7cf688bc654e41169a4b4040d3518a83
new file mode 100644
index 0000000000..cd0a921f44
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/X/data/__schema/__1730646967685_1730646967685_7cf688bc654e41169a4b4040d3518a83 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/var/__schema/__1730646967680_1730646967680_606ea9e41f3ae67cdb16e9a4cdfc8f67 b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/var/__schema/__1730646967680_1730646967680_606ea9e41f3ae67cdb16e9a4cdfc8f67
new file mode 100644
index 0000000000..aa919d163d
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/ms/raw/var/__schema/__1730646967680_1730646967680_606ea9e41f3ae67cdb16e9a4cdfc8f67 differ
diff --git a/apis/python/notebooks/data/sparse/pbmc3k/obs/__schema/__1730645851522_1730645851522_6624ac044dc93b8508f24f435c18e9e3 b/apis/python/notebooks/data/sparse/pbmc3k/obs/__schema/__1730645851522_1730645851522_6624ac044dc93b8508f24f435c18e9e3
new file mode 100644
index 0000000000..e3350785db
Binary files /dev/null and b/apis/python/notebooks/data/sparse/pbmc3k/obs/__schema/__1730645851522_1730645851522_6624ac044dc93b8508f24f435c18e9e3 differ
diff --git a/apis/python/notebooks/tutorial_exp_query.ipynb b/apis/python/notebooks/tutorial_exp_query.ipynb
index 26f1d1d5e7..5dc8804677 100644
--- a/apis/python/notebooks/tutorial_exp_query.ipynb
+++ b/apis/python/notebooks/tutorial_exp_query.ipynb
@@ -1,296 +1,532 @@
{
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "name": "python",
- "version": "3.9.15",
- "mimetype": "text/x-python",
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "pygments_lexer": "ipython3",
- "nbconvert_exporter": "python",
- "file_extension": ".py"
- }
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2b8e72a7-129c-422c-b955-350fb9ee0541",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Tutorial: SOMA Experiment queries"
+ ]
},
- "nbformat_minor": 5,
- "nbformat": 4,
- "cells": [
- {
- "cell_type": "markdown",
- "source": "# Tutorial: SOMA Experiment queries",
- "metadata": {
- "tags": []
- },
- "id": "2b8e72a7-129c-422c-b955-350fb9ee0541"
- },
- {
- "cell_type": "code",
- "source": "import tiledbsoma as soma",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 3,
- "outputs": [],
- "id": "3a5fd5d3"
- },
- {
- "cell_type": "markdown",
- "source": "In this notebook, we'll take a quick look at the SOMA experiment-query API. The dataset used is from Peripheral Blood Mononuclear Cells (PBMC), which is freely available from 10X Genomics.\n",
- "metadata": {
- "tags": []
- },
- "id": "ccc8709a"
- },
- {
- "cell_type": "code",
- "source": "exp = soma.Experiment.open('data/sparse/pbmc3k')",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 4,
- "outputs": [],
- "id": "9b8851d9-27f1-437b-a070-b41a65a5609e"
- },
- {
- "cell_type": "markdown",
- "source": "Using the keys of the `obs` dataframe, we can see what fields are available to query on.",
- "metadata": {
- "tags": []
- },
- "id": "fab7898c"
- },
- {
- "cell_type": "code",
- "source": "exp.obs.keys()",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 5,
- "outputs": [
- {
- "execution_count": 5,
- "output_type": "execute_result",
- "data": {
- "text/plain": "('soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain')"
- },
- "metadata": {}
- }
- ],
- "id": "d67dfbc6-0382-4acc-8c56-3670549654f8"
- },
- {
- "cell_type": "code",
- "source": "p = exp.obs.read(column_names=['louvain']).concat().to_pandas()\np",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 6,
- "outputs": [
- {
- "execution_count": 6,
- "output_type": "execute_result",
- "data": {
- "text/plain": " louvain\n0 CD4 T cells\n1 B cells\n2 CD4 T cells\n3 CD14+ Monocytes\n4 NK cells\n... ...\n2633 CD14+ Monocytes\n2634 B cells\n2635 B cells\n2636 B cells\n2637 CD4 T cells\n\n[2638 rows x 1 columns]",
- "text/html": "
\n\n
\n \n \n | \n louvain | \n
\n \n \n \n 0 | \n CD4 T cells | \n
\n \n 1 | \n B cells | \n
\n \n 2 | \n CD4 T cells | \n
\n \n 3 | \n CD14+ Monocytes | \n
\n \n 4 | \n NK cells | \n
\n \n ... | \n ... | \n
\n \n 2633 | \n CD14+ Monocytes | \n
\n \n 2634 | \n B cells | \n
\n \n 2635 | \n B cells | \n
\n \n 2636 | \n B cells | \n
\n \n 2637 | \n CD4 T cells | \n
\n \n
\n
2638 rows × 1 columns
\n
"
- },
- "metadata": {}
- }
- ],
- "id": "9e4ede09-2303-4c21-92c1-bf42ed4e7dd1"
- },
- {
- "cell_type": "markdown",
- "source": "Focusing on the `louvain` column, we can now find out what column values are present in the data.",
- "metadata": {
- "tags": []
- },
- "id": "f305fb7c"
- },
- {
- "cell_type": "code",
- "source": "p.groupby('louvain').size().sort_values()",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 7,
- "outputs": [
- {
- "execution_count": 7,
- "output_type": "execute_result",
- "data": {
- "text/plain": "louvain\nMegakaryocytes 15\nDendritic cells 37\nFCGR3A+ Monocytes 150\nNK cells 154\nCD8 T cells 316\nB cells 342\nCD14+ Monocytes 480\nCD4 T cells 1144\ndtype: int64"
- },
- "metadata": {}
- }
- ],
- "id": "00f1ccad-3ee2-4947-8961-8bf9642fbbba"
- },
- {
- "cell_type": "markdown",
- "source": "Now we can query the SOMA experiment -- here, by a few cell types.",
- "metadata": {
- "tags": []
- },
- "id": "fda99535"
- },
- {
- "cell_type": "code",
- "source": "obs_query = soma.AxisQuery(value_filter='louvain in [\"B cells\", \"NK cells\"]')",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 8,
- "outputs": [],
- "id": "e2ed76ca-5821-44c5-a220-ff96568686ec"
- },
- {
- "cell_type": "code",
- "source": "query = exp.axis_query(\"RNA\", obs_query=obs_query)",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 9,
- "outputs": [],
- "id": "f3af70bc-3817-453c-a18c-56dc9aa874da"
- },
- {
- "cell_type": "markdown",
- "source": "Note that the query output is smaller than the original dataset's size -- since we've queried for only a particular pair of cell types.",
- "metadata": {
- "tags": []
- },
- "id": "fb94d898"
- },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "3a5fd5d3",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import tiledbsoma as soma"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ccc8709a",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "In this notebook, we'll take a quick look at the SOMA experiment-query API. The dataset used is from Peripheral Blood Mononuclear Cells (PBMC), which is freely available from 10X Genomics.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "9b8851d9-27f1-437b-a070-b41a65a5609e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "exp = soma.Experiment.open('data/sparse/pbmc3k')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fab7898c",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Using the keys of the `obs` dataframe, we can see what fields are available to query on."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "d67dfbc6-0382-4acc-8c56-3670549654f8",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "source": "[exp.obs.count, exp.ms[\"RNA\"].var.count]",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 10,
- "outputs": [
- {
- "execution_count": 10,
- "output_type": "execute_result",
- "data": {
- "text/plain": "[2638, 1838]"
- },
- "metadata": {}
- }
- ],
- "id": "2c60568b-0789-4dbf-aff9-4bea2860aef4"
- },
+ "data": {
+ "text/plain": [
+ "('soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain')"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "exp.obs.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "9e4ede09-2303-4c21-92c1-bf42ed4e7dd1",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "source": "[query.n_obs, query.n_vars]",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 11,
- "outputs": [
- {
- "execution_count": 11,
- "output_type": "execute_result",
- "data": {
- "text/plain": "[496, 1838]"
- },
- "metadata": {}
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " louvain | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CD4 T cells | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " B cells | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CD4 T cells | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CD14+ Monocytes | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " NK cells | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2633 | \n",
+ " CD14+ Monocytes | \n",
+ "
\n",
+ " \n",
+ " 2634 | \n",
+ " B cells | \n",
+ "
\n",
+ " \n",
+ " 2635 | \n",
+ " B cells | \n",
+ "
\n",
+ " \n",
+ " 2636 | \n",
+ " B cells | \n",
+ "
\n",
+ " \n",
+ " 2637 | \n",
+ " CD4 T cells | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2638 rows × 1 columns
\n",
+ "
"
],
- "id": "28ed8d40-36c5-4642-bd8f-53d35c3074f0"
- },
- {
- "cell_type": "markdown",
- "source": "Here we can take a look at the X data.",
- "metadata": {
- "tags": []
- },
- "id": "c9625771"
- },
+ "text/plain": [
+ " louvain\n",
+ "0 CD4 T cells\n",
+ "1 B cells\n",
+ "2 CD4 T cells\n",
+ "3 CD14+ Monocytes\n",
+ "4 NK cells\n",
+ "... ...\n",
+ "2633 CD14+ Monocytes\n",
+ "2634 B cells\n",
+ "2635 B cells\n",
+ "2636 B cells\n",
+ "2637 CD4 T cells\n",
+ "\n",
+ "[2638 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p = exp.obs.read(column_names=['louvain']).concat().to_pandas()\n",
+ "p"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f305fb7c",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Focusing on the `louvain` column, we can now find out what column values are present in the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "00f1ccad-3ee2-4947-8961-8bf9642fbbba",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "source": "query.X(\"data\").tables().concat().to_pandas()",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 12,
- "outputs": [
- {
- "execution_count": 12,
- "output_type": "execute_result",
- "data": {
- "text/plain": " soma_dim_0 soma_dim_1 soma_data\n0 1 0 -0.214582\n1 1 1 -0.372653\n2 1 2 -0.054804\n3 1 3 -0.683391\n4 1 4 0.633951\n... ... ... ...\n911643 2636 1833 -0.149789\n911644 2636 1834 -0.325824\n911645 2636 1835 -0.005918\n911646 2636 1836 -0.135213\n911647 2636 1837 -0.482111\n\n[911648 rows x 3 columns]",
- "text/html": "\n\n
\n \n \n | \n soma_dim_0 | \n soma_dim_1 | \n soma_data | \n
\n \n \n \n 0 | \n 1 | \n 0 | \n -0.214582 | \n
\n \n 1 | \n 1 | \n 1 | \n -0.372653 | \n
\n \n 2 | \n 1 | \n 2 | \n -0.054804 | \n
\n \n 3 | \n 1 | \n 3 | \n -0.683391 | \n
\n \n 4 | \n 1 | \n 4 | \n 0.633951 | \n
\n \n ... | \n ... | \n ... | \n ... | \n
\n \n 911643 | \n 2636 | \n 1833 | \n -0.149789 | \n
\n \n 911644 | \n 2636 | \n 1834 | \n -0.325824 | \n
\n \n 911645 | \n 2636 | \n 1835 | \n -0.005918 | \n
\n \n 911646 | \n 2636 | \n 1836 | \n -0.135213 | \n
\n \n 911647 | \n 2636 | \n 1837 | \n -0.482111 | \n
\n \n
\n
911648 rows × 3 columns
\n
"
- },
- "metadata": {}
- }
- ],
- "id": "65063167-5015-497a-9712-d72c0ecac2ed"
- },
+ "data": {
+ "text/plain": [
+ "louvain\n",
+ "Megakaryocytes 15\n",
+ "Dendritic cells 37\n",
+ "FCGR3A+ Monocytes 150\n",
+ "NK cells 154\n",
+ "CD8 T cells 316\n",
+ "B cells 342\n",
+ "CD14+ Monocytes 480\n",
+ "CD4 T cells 1144\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "p.groupby('louvain').size().sort_values()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fda99535",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Now we can query the SOMA experiment -- here, by a few cell types."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e2ed76ca-5821-44c5-a220-ff96568686ec",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "obs_query = soma.AxisQuery(value_filter='louvain in [\"B cells\", \"NK cells\"]')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f3af70bc-3817-453c-a18c-56dc9aa874da",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "query = exp.axis_query(\"RNA\", obs_query=obs_query)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb94d898",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Note that the query output is smaller than the original dataset's size -- since we've queried for only a particular pair of cell types."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "2c60568b-0789-4dbf-aff9-4bea2860aef4",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "source": "To finish out this introductory look at the experiment-query API, we can convert our query outputs to AnnData format.",
- "metadata": {
- "tags": []
- },
- "id": "db7af8b8"
- },
+ "data": {
+ "text/plain": [
+ "[2638, 1838]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[exp.obs.count, exp.ms[\"RNA\"].var.count]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "28ed8d40-36c5-4642-bd8f-53d35c3074f0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "source": "adata = query.to_anndata(X_name=\"data\")",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 13,
- "outputs": [],
- "id": "1ed8510b-343a-4f88-8aae-11a5c2069311"
- },
+ "data": {
+ "text/plain": [
+ "[496, 1838]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[query.n_obs, query.n_vars]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c9625771",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Here we can take a look at the X data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "65063167-5015-497a-9712-d72c0ecac2ed",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "source": "adata",
- "metadata": {
- "tags": [],
- "trusted": true
- },
- "execution_count": 14,
- "outputs": [
- {
- "execution_count": 14,
- "output_type": "execute_result",
- "data": {
- "text/plain": "AnnData object with n_obs × n_vars = 496 × 1838\n obs: 'soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain'\n var: 'soma_joinid', 'var_id', 'n_cells'"
- },
- "metadata": {}
- }
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " soma_dim_0 | \n",
+ " soma_dim_1 | \n",
+ " soma_data | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " -0.214582 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " -0.372653 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " -0.054804 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " -0.683391 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 0.633951 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 911643 | \n",
+ " 2636 | \n",
+ " 1833 | \n",
+ " -0.149789 | \n",
+ "
\n",
+ " \n",
+ " 911644 | \n",
+ " 2636 | \n",
+ " 1834 | \n",
+ " -0.325824 | \n",
+ "
\n",
+ " \n",
+ " 911645 | \n",
+ " 2636 | \n",
+ " 1835 | \n",
+ " -0.005918 | \n",
+ "
\n",
+ " \n",
+ " 911646 | \n",
+ " 2636 | \n",
+ " 1836 | \n",
+ " -0.135213 | \n",
+ "
\n",
+ " \n",
+ " 911647 | \n",
+ " 2636 | \n",
+ " 1837 | \n",
+ " -0.482111 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
911648 rows × 3 columns
\n",
+ "
"
],
- "id": "b3118504-8c92-48d4-9b83-87176960e4f1"
- },
+ "text/plain": [
+ " soma_dim_0 soma_dim_1 soma_data\n",
+ "0 1 0 -0.214582\n",
+ "1 1 1 -0.372653\n",
+ "2 1 2 -0.054804\n",
+ "3 1 3 -0.683391\n",
+ "4 1 4 0.633951\n",
+ "... ... ... ...\n",
+ "911643 2636 1833 -0.149789\n",
+ "911644 2636 1834 -0.325824\n",
+ "911645 2636 1835 -0.005918\n",
+ "911646 2636 1836 -0.135213\n",
+ "911647 2636 1837 -0.482111\n",
+ "\n",
+ "[911648 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "query.X(\"data\").tables().concat().to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db7af8b8",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "To finish out this introductory look at the experiment-query API, we can convert our query outputs to AnnData format."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "1ed8510b-343a-4f88-8aae-11a5c2069311",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "adata = query.to_anndata(X_name=\"data\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "b3118504-8c92-48d4-9b83-87176960e4f1",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "source": "",
- "metadata": {
- "trusted": true
- },
- "execution_count": null,
- "outputs": [],
- "id": "f2e46ce1-cf7a-43c2-9f0d-bf918fd806bc"
+ "data": {
+ "text/plain": [
+ "AnnData object with n_obs × n_vars = 496 × 1838\n",
+ " obs: 'soma_joinid', 'obs_id', 'n_genes', 'percent_mito', 'n_counts', 'louvain'\n",
+ " var: 'soma_joinid', 'var_id', 'n_cells'"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
}
- ]
+ ],
+ "source": [
+ "adata"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/apis/python/notebooks/tutorial_soma_append_mode.ipynb b/apis/python/notebooks/tutorial_soma_append_mode.ipynb
index d33d12f0f1..a69a40a0f6 100644
--- a/apis/python/notebooks/tutorial_soma_append_mode.ipynb
+++ b/apis/python/notebooks/tutorial_soma_append_mode.ipynb
@@ -1,1150 +1,1261 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "# Tutorial: TileDB-SOMA append-mode"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "As of TileDB-SOMA 1.5.0, we're excited to offer support for append mode.\n",
- "\n",
- "Use-cases include ingesting H5AD/AnnData from multiple sequencing runs over time, accumulating the data over time, into millions of cells."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "First, we'll do the usual package imports:"
- ]
- },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "36a0b22b",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Tutorial: TileDB-SOMA append-mode"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69de8627",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "As of TileDB-SOMA 1.5.0, we're excited to offer support for append mode.\n",
+ "\n",
+ "Use cases include ingesting H5AD/AnnData from multiple sequencing runs and accumulating the data over time into millions of cells."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2a218461",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "First, we'll do the usual package imports:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "d6b81174",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "tiledbsoma.__version__ 1.11.1\n",
- "TileDB-Py version 0.29.0\n",
- "TileDB core version (tiledb) 2.23.0\n",
- "TileDB core version (libtiledbsoma) 2.23.0\n",
- "python version 3.11.8.final.0\n",
- "OS version Linux 4.14.343-261.564.amzn2.x86_64\n"
- ]
- }
- ],
- "source": [
- "import scanpy as sc\n",
- "import tiledbsoma\n",
- "import tiledbsoma.io\n",
- "import tiledbsoma.logging\n",
- "\n",
- "tiledbsoma.show_package_versions()"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tiledbsoma.__version__ 1.15.0rc0.post92.dev1796808282\n",
+ "TileDB core version (libtiledbsoma) 2.27.0\n",
+ "python version 3.11.9.final.0\n",
+ "OS version Darwin 24.1.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import scanpy as sc\n",
+ "import tiledbsoma\n",
+ "import tiledbsoma.io\n",
+ "import tiledbsoma.logging\n",
+ "\n",
+ "tiledbsoma.show_package_versions()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7a65011",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Next we'll set up where our data are going:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "24108e1c",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Next we'll set up where our data are going:"
+ "data": {
+ "text/plain": [
+ "'/tmp/append-example-20241103-094551'"
]
- },
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import datetime\n",
+ "\n",
+ "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n",
+ "experiment_uri = f\"/tmp/append-example-{stamp}\"\n",
+ "experiment_uri"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e835c440",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "For this demo, we're writing to `/tmp`, but URIs like the following allow storing data on TileDB Cloud, cloud storage such as S3, or instance-local NVMe:\n",
+ "\n",
+ "- `/var/data/mysoma1`\n",
+ "- `s3://mybucket/mysoma2`\n",
+ "- `tiledb://mynamespace/s3://mybucket/mysoma3`\n",
+ "\n",
+ "Everything in this notebook below this URI-selection cell is agnostic to the storage backend."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ffee7b3",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Create the initial SOMA Experiment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bb3aa6c8",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Next we'll prep some input data. To make things easy for this self-contained demo, we'll use Scanpy's `pbmc3k`, with a custom column."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "fe0e7a46",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'/tmp/append-example-20240521-145833'"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import datetime\n",
- "\n",
- "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n",
- "experiment_uri = f\"/tmp/append-example-{stamp}\"\n",
- "experiment_uri"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████████████████████████████████████████████████████████████████████████| 5.58M/5.58M [00:01<00:00, 4.18MB/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "ad1 = sc.datasets.pbmc3k()\n",
+ "sc.pp.calculate_qc_metrics(ad1, inplace=True)\n",
+ "ad1.obs[\"when\"] = [\"Monday\"] * len(ad1.obs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88af955c",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Now we're ready to ingest the data into a SOMA experiment. Since SOMA is multimodal, we'll specify the destination modality, or measurement name, to be \"RNA\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "10cbd82b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "measurement_name = \"RNA\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "a7c7914f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "For this demo, we're writing to `/tmp`, but URIs like the following allow storing data on TileDB Cloud, cloud storage such as S3, or instance-local NVME:\n",
- "\n",
- "- `/var/data/mysoma1`\n",
- "- `s3://mybucket/mysoma2`\n",
- "- `tiledb://mynamespace/s3://mybucket/mysoma3`\n",
- "\n",
- "Everything in this notebook below this URI-selection cell is agnostic to the storage backend."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Registration: registering isolated AnnData object.\n",
+ "Wrote /tmp/append-example-20241103-094551/obs\n",
+ "Wrote /tmp/append-example-20241103-094551/ms/RNA/var\n",
+ "Writing /tmp/append-example-20241103-094551/ms/RNA/X/data\n",
+ "Wrote /tmp/append-example-20241103-094551/ms/RNA/X/data\n",
+ "Wrote /tmp/append-example-20241103-094551\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Create the initial SOMA Experiment"
+ "data": {
+ "text/plain": [
+ "'/tmp/append-example-20241103-094551'"
]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Next we'll prep some input data. To make things easy for this self-contained demo, we'll use Scanpy's `pbmc3k`, with a custom column."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":119: FutureWarning: SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset.\n",
+ "\n",
+ "For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.\n",
+ "\n",
+ "For creation, use `anndata.experimental.sparse_dataset(X)` instead.\n",
+ "\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "ad1 = sc.datasets.pbmc3k()\n",
- "sc.pp.calculate_qc_metrics(ad1, inplace=True)\n",
- "ad1.obs[\"when\"] = [\"Monday\"] * len(ad1.obs)"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Now we're ready to ingest the data into a SOMA experiment. Since SOMA is multimodal, we'll specify the destination modality, or measurement name, to be \"RNA\"."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145833\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "measurement_name = \"RNA\""
+ "data": {
+ "text/plain": [
+ "'/tmp/append-example-20240521-145833'"
]
- },
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tiledbsoma.logging.info()\n",
+ "tiledbsoma.io.from_anndata(experiment_uri, ad1, measurement_name=measurement_name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c993ff5",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Now let's read back the data. We'll take a look at `obs`, `var`, and `X`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "40c6b6f0",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "**obs**: For this initial ingest, there are obs IDs ending in `-1`, the `when` is `Monday`, and there are 2700 rows. Also note that since TileDB is a columnar database, when we select certain columns, those are the only ones loaded from disk. This positively impacts performance at cloud scale."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "d6ca5c9e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: registering isolated AnnData object.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833/obs\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833/ms/RNA/var\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Writing /tmp/append-example-20240521-145833/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- ":119: FutureWarning: SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset.\n",
- "\n",
- "For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.\n",
- "\n",
- "For creation, use `anndata.experimental.sparse_dataset(X)` instead.\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "'/tmp/append-example-20240521-145833'"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tiledbsoma.logging.info()\n",
- "tiledbsoma.io.from_anndata(experiment_uri, ad1, measurement_name=measurement_name)"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " obs_id n_genes_by_counts when\n",
+ "0 AAACATACAACCAC-1 781 Monday\n",
+ "1 AAACATTGAGCTAC-1 1352 Monday\n",
+ "2 AAACATTGATCAGC-1 1131 Monday\n",
+ "3 AAACCGTGCTTCCG-1 960 Monday\n",
+ "4 AAACCGTGTATGCG-1 522 Monday\n",
+ "... ... ... ...\n",
+ "2695 TTTCGAACTCTCAT-1 1155 Monday\n",
+ "2696 TTTCTACTGAGGCA-1 1227 Monday\n",
+ "2697 TTTCTACTTCCTCG-1 622 Monday\n",
+ "2698 TTTGCATGAGAGGC-1 454 Monday\n",
+ "2699 TTTGCATGCCTCAC-1 724 Monday\n",
+ "\n",
+ "[2700 rows x 3 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " print(\n",
+ " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
+ " .concat()\n",
+ " .to_pandas()\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b8610b39",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "**var**: Let's also look at `var`, selecting out the join IDs (which index columns of `X`) as well as the Ensembl-format and NCBI-format gene IDs:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "221c472f",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Now let's read back the data. We'll take a look at `obs`, `var`, and `X`."
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " soma_joinid var_id gene_ids\n",
+ "0 0 MIR1302-10 ENSG00000243485\n",
+ "1 1 FAM138A ENSG00000237613\n",
+ "2 2 OR4F5 ENSG00000186092\n",
+ "3 3 RP11-34P13.7 ENSG00000238009\n",
+ "4 4 RP11-34P13.8 ENSG00000239945\n",
+ "... ... ... ...\n",
+ "32733 32733 AC145205.1 ENSG00000215635\n",
+ "32734 32734 BAGE5 ENSG00000268590\n",
+ "32735 32735 CU459201.1 ENSG00000251180\n",
+ "32736 32736 AC002321.2 ENSG00000215616\n",
+ "32737 32737 AC002321.1 ENSG00000215611\n",
+ "\n",
+ "[32738 rows x 3 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " print(\n",
+ " exp.ms[\"RNA\"]\n",
+ " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
+ " .concat()\n",
+ " .to_pandas()\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2e74cd29",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "**X**: Lastly let's look at the expression matrix, in COO format. (You can convert to other formats if you like.) Its rows and columns are indexed by the `soma_joinid` of the `obs` and `var` dataframes, respectively."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "69ba087b-cb5f-4851-8ee6-3a4d828d70a6",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " X = exp.ms[\"RNA\"].X[\"data\"]\n",
+ " print(X.read().tables().concat().to_pandas())\n",
+ " print()\n",
+ " print(X.used_shape())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "bc5b5740-79a4-4831-8549-f06ac079fb02",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "**obs**: For this initial ingest, there are obs IDs ending in `-1`, the `when` is `Monday`, and there are 2700 rows. Also note that since TileDB is a columnar database, when we select certain columns, those are the only ones loaded from disk. This positively impacts performance at cloud scale."
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "[DataFrame] obs \n",
+ " URI file:///tmp/append-example-20241103-094551/obs\n",
+ " count 2700\n",
+ " domain ((0, 2699),)\n",
+ " maxdomain ((0, 9223372036854773758),)\n",
+ " upgraded True\n",
+ "\n",
+ "[DataFrame] ms/RNA/var \n",
+ " URI file:///tmp/append-example-20241103-094551/ms/RNA/var\n",
+ " count 32738\n",
+ " domain ((0, 32737),)\n",
+ " maxdomain ((0, 9223372036854773758),)\n",
+ " upgraded True\n",
+ "\n",
+ "[SparseNDArray] ms/RNA/X/data \n",
+ " URI file:///tmp/append-example-20241103-094551/ms/RNA/X/data\n",
+ " used_shape ((0, 2699), (0, 32732))\n",
+ " shape (2700, 32738)\n",
+ " maxshape (9223372036854773759, 9223372036854773759)\n",
+ " upgraded True\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " obs_id n_genes_by_counts when\n",
- "0 AAACATACAACCAC-1 781 Monday\n",
- "1 AAACATTGAGCTAC-1 1352 Monday\n",
- "2 AAACATTGATCAGC-1 1131 Monday\n",
- "3 AAACCGTGCTTCCG-1 960 Monday\n",
- "4 AAACCGTGTATGCG-1 522 Monday\n",
- "... ... ... ...\n",
- "2695 TTTCGAACTCTCAT-1 1155 Monday\n",
- "2696 TTTCTACTGAGGCA-1 1227 Monday\n",
- "2697 TTTCTACTTCCTCG-1 622 Monday\n",
- "2698 TTTGCATGAGAGGC-1 454 Monday\n",
- "2699 TTTGCATGCCTCAC-1 724 Monday\n",
- "\n",
- "[2700 rows x 3 columns]\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " print(\n",
- " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
- " .concat()\n",
- " .to_pandas()\n",
- " )"
+ "data": {
+ "text/plain": [
+ "True"
]
- },
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tiledbsoma.io.show_experiment_shapes(exp.uri)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd08018e",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Appending a new dataset to the SOMA Experiment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10f03631",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Now, let's simulate another day's sequencing run. For simplicity of this demo notebook, we'll mutate the previous dataset, changing the obs IDs to have a `-2` suffix, and also putting `Tuesday` in the `when` column. Also, we'll multiply the `X` values by 10."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "81ca4031",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "ad2 = ad1.copy()\n",
+ "ad2.obs.index = [e.replace(\"-1\", \"-2\") for e in ad1.obs.index]\n",
+ "ad2.obs[\"when\"] = [\"Tuesday\"] * len(ad2.obs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "d703ebb7",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "ad2.X *= 10"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "90e85660",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Now we simply ingest as before -- the only additional step is a black-box registration step which detects which cell IDs are new (here, all of them) and which gene IDs are new (here, none of them).\n",
+ "\n",
+ "The registration takes two forms, either of which you can use depending on your use-case: `tiledbsoma.io.register_anndatas` for in-memory AnnData objects, or `tiledbsoma.io.register_h5ads` for on-storage AnnData objects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dd0cf6c9-cd3c-4faf-84f5-8659cac7c4f5",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
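+ "# XXX TEMP: `rd` is only assigned further down, by `register_anndatas`, so this cell raises NameError when the notebook is run top to bottom.\n",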
+ "tiledbsoma.io.resize_experiment(exp.uri, nobs=rd.get_obs_shape(), nvars=rd.get_var_shapes())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "f183088d-b428-47ce-99f6-c12157867357",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "**var**: Let's also look at `var`, selecting out the join IDs (which index columns of `X`) as well as the Ensembl-format and NCBI-format gene IDs:"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "[DataFrame] obs \n",
+ " URI file:///tmp/append-example-20241103-094551/obs\n",
+ " count 5400\n",
+ " domain ((0, 5399),)\n",
+ " maxdomain ((0, 9223372036854773758),)\n",
+ " upgraded True\n",
+ "\n",
+ "[DataFrame] ms/RNA/var \n",
+ " URI file:///tmp/append-example-20241103-094551/ms/RNA/var\n",
+ " count 32738\n",
+ " domain ((0, 32737),)\n",
+ " maxdomain ((0, 9223372036854773758),)\n",
+ " upgraded True\n",
+ "\n",
+ "[SparseNDArray] ms/RNA/X/data \n",
+ " URI file:///tmp/append-example-20241103-094551/ms/RNA/X/data\n",
+ " used_shape ((0, 5399), (0, 32732))\n",
+ " shape (5400, 32738)\n",
+ " maxshape (9223372036854773759, 9223372036854773759)\n",
+ " upgraded True\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " soma_joinid var_id gene_ids\n",
- "0 0 MIR1302-10 ENSG00000243485\n",
- "1 1 FAM138A ENSG00000237613\n",
- "2 2 OR4F5 ENSG00000186092\n",
- "3 3 RP11-34P13.7 ENSG00000238009\n",
- "4 4 RP11-34P13.8 ENSG00000239945\n",
- "... ... ... ...\n",
- "32733 32733 AC145205.1 ENSG00000215635\n",
- "32734 32734 BAGE5 ENSG00000268590\n",
- "32735 32735 CU459201.1 ENSG00000251180\n",
- "32736 32736 AC002321.2 ENSG00000215616\n",
- "32737 32737 AC002321.1 ENSG00000215611\n",
- "\n",
- "[32738 rows x 3 columns]\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " print(\n",
- " exp.ms[\"RNA\"]\n",
- " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
- " .concat()\n",
- " .to_pandas()\n",
- " )"
+ "data": {
+ "text/plain": [
+ "True"
]
- },
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# XXX TEMP\n",
+ "tiledbsoma.io.show_experiment_shapes(exp.uri)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "2ee7c0e4-eb36-482e-a37e-7c786607b76a",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "**X**: Lastly let's look at the expression matrix, in COO format. (You can convert to other formats if you like.) Its rows and columns are indexed by the `soma_joinid` of the `obs` and `var` dataframes, respectively."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Registration: starting with experiment /tmp/append-example-20241103-094551\n",
+ "Registration: found nobs=2700 nvar=32738 from experiment.\n",
+ "Registration: registering AnnData object.\n",
+ "Registration: accumulated to nobs=5400 nvar=32738.\n",
+ "Registration: complete.\n",
+ "Wrote /tmp/append-example-20241103-094551/obs\n",
+ "Wrote /tmp/append-example-20241103-094551/ms/RNA/var\n",
+ "Writing /tmp/append-example-20241103-094551/ms/RNA/X/data\n",
+ "Wrote /tmp/append-example-20241103-094551/ms/RNA/X/data\n",
+ "Wrote file:///tmp/append-example-20241103-094551\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " soma_dim_0 soma_dim_1 soma_data\n",
- "0 0 70 1.0\n",
- "1 0 166 1.0\n",
- "2 0 178 2.0\n",
- "3 0 326 1.0\n",
- "4 0 363 1.0\n",
- "... ... ... ...\n",
- "2286879 2699 32697 1.0\n",
- "2286880 2699 32698 7.0\n",
- "2286881 2699 32702 1.0\n",
- "2286882 2699 32705 1.0\n",
- "2286883 2699 32708 3.0\n",
- "\n",
- "[2286884 rows x 3 columns]\n",
- "\n",
- "((0, 2699), (0, 32732))\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " X = exp.ms[\"RNA\"].X[\"data\"]\n",
- " print(X.read().tables().concat().to_pandas())\n",
- " print()\n",
- " print(X.used_shape())"
+ "data": {
+ "text/plain": [
+ "'file:///tmp/append-example-20241103-094551'"
]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Appending a new dataset to the SOMA Experiment"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Now, let's simiulate another day's sequencing run. For simplicity of this demo notebook, we'll mutate the previous dataset, changing the obs IDs to have a `-2` suffix, and also putting `Tuesday` in the `when` column. Also, we'll multiply the `X` values by 10."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145833\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "ad2 = ad1.copy()\n",
- "ad2.obs.index = [e.replace(\"-1\", \"-2\") for e in ad1.obs.index]\n",
- "ad2.obs[\"when\"] = [\"Tuesday\"] * len(ad2.obs)"
+ "data": {
+ "text/plain": [
+ "'/tmp/append-example-20240521-145833'"
]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "ad2.X *= 10"
- ]
- },
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rd = tiledbsoma.io.register_anndatas(\n",
+ " experiment_uri,\n",
+ " [ad2],\n",
+ " measurement_name=measurement_name,\n",
+ " obs_field_name=\"obs_id\",\n",
+ " var_field_name=\"var_id\",\n",
+ ")\n",
+ "\n",
+ "tiledbsoma.io.from_anndata(\n",
+ " experiment_uri,\n",
+ " ad2,\n",
+ " measurement_name=measurement_name,\n",
+ " registration_mapping=rd,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53d07733",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Now let's read back the appended data. There are now obs IDs ending in `-1` as well as `-2`, the `when` includes `Monday` as well as `Tuesday`, and there are 5400 rows.\n",
+ "\n",
+ "(For `Wednesday` and onward, it'll simply be the same pattern -- we can grow our data iteratively over time, to arbitrary sizes.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "a7b2aebe",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Now we simply ingest as before -- the only additional step is a black-box registration step which detects which cell IDs are new (here, all of them) and which gene IDs are new (here, none of them).\n",
- "\n",
- "The registration takes two forms, either of which you can use depending on your use-case: `tiledbsoma.io.register_anndatas` for in-memory AnnData objects, or `tiledbsoma.io.register_h5ads` for on-storage AnnData objects."
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " obs_id n_genes_by_counts when\n",
+ "0 AAACATACAACCAC-1 781 Monday\n",
+ "1 AAACATTGAGCTAC-1 1352 Monday\n",
+ "2 AAACATTGATCAGC-1 1131 Monday\n",
+ "3 AAACCGTGCTTCCG-1 960 Monday\n",
+ "4 AAACCGTGTATGCG-1 522 Monday\n",
+ "... ... ... ...\n",
+ "5395 TTTCGAACTCTCAT-2 1155 Tuesday\n",
+ "5396 TTTCTACTGAGGCA-2 1227 Tuesday\n",
+ "5397 TTTCTACTTCCTCG-2 622 Tuesday\n",
+ "5398 TTTGCATGAGAGGC-2 454 Tuesday\n",
+ "5399 TTTGCATGCCTCAC-2 724 Tuesday\n",
+ "\n",
+ "[5400 rows x 3 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " print(\n",
+ " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
+ " .concat()\n",
+ " .to_pandas()\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c8bc7cd2",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Let's also look at `var`, as before. Since we had data for more cells but for the same genes, there is nothing new here. The `obs` table grew downward with the new cells, and `X` grew downward with new rows, but `var` stayed the same.\n",
+ "\n",
+ "In real-world data, occasionally you will see a gene expressed in subsequent data which wasn't expressed in the initial data. That's fine -- you'll simply see `var` grow just a bit for those newly encountered gene IDs, with corresponding new columns for `X`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "4a1cc20e",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: starting with experiment /tmp/append-example-20240521-145833\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: found nobs=2700 nvar=32738 from experiment.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: registering AnnData object.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: accumulated to nobs=5400 nvar=32738.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: complete.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833/obs\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833/ms/RNA/var\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Writing /tmp/append-example-20240521-145833/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145833\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "'/tmp/append-example-20240521-145833'"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "rd = tiledbsoma.io.register_anndatas(\n",
- " experiment_uri,\n",
- " [ad2],\n",
- " measurement_name=measurement_name,\n",
- " obs_field_name=\"obs_id\",\n",
- " var_field_name=\"var_id\",\n",
- ")\n",
- "\n",
- "tiledbsoma.io.from_anndata(\n",
- " experiment_uri,\n",
- " ad2,\n",
- " measurement_name=measurement_name,\n",
- " registration_mapping=rd,\n",
- ")"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " soma_joinid var_id gene_ids\n",
+ "0 0 MIR1302-10 ENSG00000243485\n",
+ "1 1 FAM138A ENSG00000237613\n",
+ "2 2 OR4F5 ENSG00000186092\n",
+ "3 3 RP11-34P13.7 ENSG00000238009\n",
+ "4 4 RP11-34P13.8 ENSG00000239945\n",
+ "... ... ... ...\n",
+ "32733 32733 AC145205.1 ENSG00000215635\n",
+ "32734 32734 BAGE5 ENSG00000268590\n",
+ "32735 32735 CU459201.1 ENSG00000251180\n",
+ "32736 32736 AC002321.2 ENSG00000215616\n",
+ "32737 32737 AC002321.1 ENSG00000215611\n",
+ "\n",
+ "[32738 rows x 3 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " print(\n",
+ " exp.ms[\"RNA\"]\n",
+ " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
+ " .concat()\n",
+ " .to_pandas()\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "499785d6",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "And lastly, the `X` expression matrix which has grown downward with the new cells, while keeping the same width as we didn't introduce new genes:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "d640bde0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Now let's read back the appended data. There are now obs IDs ending in `-1` as well as `-2`, the `when` includes `Monday` as well as `Tuesday`, and there are 5400 rows.\n",
- "\n",
- "(For `Wednesday` and onward, it'll simply be the same pattern -- we can grow our data iteratively over time, to arbitrary sizes.)"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " soma_dim_0 soma_dim_1 soma_data\n",
+ "0 0 70 1.0\n",
+ "1 0 166 1.0\n",
+ "2 0 178 2.0\n",
+ "3 0 326 1.0\n",
+ "4 0 363 1.0\n",
+ "... ... ... ...\n",
+ "4573763 5399 32697 10.0\n",
+ "4573764 5399 32698 70.0\n",
+ "4573765 5399 32702 10.0\n",
+ "4573766 5399 32705 10.0\n",
+ "4573767 5399 32708 30.0\n",
+ "\n",
+ "[4573768 rows x 3 columns]\n",
+ "\n",
+ "((0, 5399), (0, 32732))\n"
+ ]
+ }
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " X = exp.ms[\"RNA\"].X[\"data\"]\n",
+ " print(X.read().tables().concat().to_pandas())\n",
+ " print()\n",
+ " print(X.used_shape())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "290da7f2",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "## Ingesting multiple datasets to a SOMA Experiment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5d812c64",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Finally, we'll demonstrate combining multiple AnnDatas into one new experiment.\n",
+ "\n",
+ "The flow is pretty similar to the above:\n",
+ "\n",
+ "1. One call to `register_anndatas` or `register_h5ads` (passing all input AnnDatas/h5ads)\n",
+ "2. One call to `from_anndata`/`from_h5ad` *for each input AnnData*\n",
+ "\n",
+ "Here's a helper function to simulate multiple lab runs. As above, where we used `pbmc3k` to simulate Monday and Tuesday data, here we use `pbmc3k` to simulate multiple AnnData objects."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "c3c185fb",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def make_ad(when, scale, obs_id_suffix):\n",
+ " ad = ad1.copy()\n",
+ " ad.obs.index = [e.replace(\"-1\", obs_id_suffix) for e in ad.obs.index]\n",
+ " ad.obs[\"when\"] = [when] * len(ad.obs)\n",
+ " ad.X *= scale\n",
+ " return ad\n",
+ "\n",
+ "ads = [\n",
+ " make_ad(when, scale, f\"-{idx + 3}\")\n",
+ " for idx, (when, scale)\n",
+ " in enumerate({\n",
+ " \"Wednesday\": 20,\n",
+ " \"Thursday\": 30,\n",
+ " \"Friday\": 40,\n",
+ " }.items())\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7da62a10",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "We'll ingest these AnnData objects, as before, but this time to a fresh/empty `/tmp` location:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "ae2d62ae",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " obs_id n_genes_by_counts when\n",
- "0 AAACATACAACCAC-1 781 Monday\n",
- "1 AAACATTGAGCTAC-1 1352 Monday\n",
- "2 AAACATTGATCAGC-1 1131 Monday\n",
- "3 AAACCGTGCTTCCG-1 960 Monday\n",
- "4 AAACCGTGTATGCG-1 522 Monday\n",
- "... ... ... ...\n",
- "5395 TTTCGAACTCTCAT-2 1155 Tuesday\n",
- "5396 TTTCTACTGAGGCA-2 1227 Tuesday\n",
- "5397 TTTCTACTTCCTCG-2 622 Tuesday\n",
- "5398 TTTGCATGAGAGGC-2 454 Tuesday\n",
- "5399 TTTGCATGCCTCAC-2 724 Tuesday\n",
- "\n",
- "[5400 rows x 3 columns]\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " print(\n",
- " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
- " .concat()\n",
- " .to_pandas()\n",
- " )"
+ "data": {
+ "text/plain": [
+ "'/tmp/append-example-20241103-095132'"
]
- },
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n",
+ "experiment_uri = f\"/tmp/append-example-{stamp}\"\n",
+ "experiment_uri"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b89fe0ee",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Here we'll register all the AnnData objects. Note that the SOMA Experiment doesn't exist yet, so we pass `experiment_uri=None` to signify that."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ac21dd19-2fd5-41ec-98e5-2596e0795f0d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "rd2 = tiledbsoma.io.register_anndatas(\n",
+ " experiment_uri=None, # new Experiment, from scratch\n",
+ " adatas=ads,\n",
+ " measurement_name=measurement_name,\n",
+ " obs_field_name=\"obs_id\",\n",
+ " var_field_name=\"var_id\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "d6cff24a-1de8-46e1-b502-e69a0e92dc92",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Let's also look at `var`, as before. Since we had data for more cells but for the same genes, there is nothing new here. The `obs` table grew downward with the new cells, and `X` grew downward with new rows, but `var` stayed the same.\n",
- "\n",
- "In real-world data, occasionally you will see a gene expressed in subsequent data which wasn't expressed in the initial data. That's fine -- you'll simply see `var` grow just a bit for those newly encountered gene IDs, with corresponding new columns for `X`."
+ "data": {
+ "text/plain": [
+ "True"
]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
},
{
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " soma_joinid var_id gene_ids\n",
- "0 0 MIR1302-10 ENSG00000243485\n",
- "1 1 FAM138A ENSG00000237613\n",
- "2 2 OR4F5 ENSG00000186092\n",
- "3 3 RP11-34P13.7 ENSG00000238009\n",
- "4 4 RP11-34P13.8 ENSG00000239945\n",
- "... ... ... ...\n",
- "32733 32733 AC145205.1 ENSG00000215635\n",
- "32734 32734 BAGE5 ENSG00000268590\n",
- "32735 32735 CU459201.1 ENSG00000251180\n",
- "32736 32736 AC002321.2 ENSG00000215616\n",
- "32737 32737 AC002321.1 ENSG00000215611\n",
- "\n",
- "[32738 rows x 3 columns]\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " print(\n",
- " exp.ms[\"RNA\"]\n",
- " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
- " .concat()\n",
- " .to_pandas()\n",
- " )"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Registration: registering AnnData object.\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "And lastly, the `X` expression matrix which has grown downward with the new cells, while keeping the same width as we didn't introduce new genes:"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Registration: accumulated to nobs=8100 nvar=32738.\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " soma_dim_0 soma_dim_1 soma_data\n",
- "0 0 70 1.0\n",
- "1 0 166 1.0\n",
- "2 0 178 2.0\n",
- "3 0 326 1.0\n",
- "4 0 363 1.0\n",
- "... ... ... ...\n",
- "4573763 5399 32697 10.0\n",
- "4573764 5399 32698 70.0\n",
- "4573765 5399 32702 10.0\n",
- "4573766 5399 32705 10.0\n",
- "4573767 5399 32708 30.0\n",
- "\n",
- "[4573768 rows x 3 columns]\n",
- "\n",
- "((0, 5399), (0, 32732))\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " X = exp.ms[\"RNA\"].X[\"data\"]\n",
- " print(X.read().tables().concat().to_pandas())\n",
- " print()\n",
- " print(X.used_shape())"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Registration: complete.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# XXX TEMP\n",
+ "tiledbsoma.io.resize_experiment(exp.uri, nobs=rd2.get_obs_shape(), nvars=rd2.get_var_shapes())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "77429cf0",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Now that we've gotten the registrations for all the input AnnData objects, we can ingest them.\n",
+ "\n",
+ "Note:\n",
+ "\n",
+ "- Here we ingest them sequentially, in the same order as above.\n",
+ "- But we could also ingest them in any shuffled order.\n",
+ "- Or we could have multiple workers ingest them in parallel, one worker per AnnData object, as long as the registration data are passed to each worker."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "27ed22b2",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "## Ingesting multiple datasets to a SOMA Experiment"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20241103-095132/obs\n",
+ "Wrote /tmp/append-example-20241103-095132/ms/RNA/var\n",
+ "Writing /tmp/append-example-20241103-095132/ms/RNA/X/data\n",
+ "Wrote /tmp/append-example-20241103-095132/ms/RNA/X/data\n",
+ "Wrote file:///tmp/append-example-20241103-095132\n",
+ "Wrote /tmp/append-example-20241103-095132/obs\n",
+ "Wrote /tmp/append-example-20241103-095132/ms/RNA/var\n",
+ "Writing /tmp/append-example-20241103-095132/ms/RNA/X/data\n",
+ "Wrote /tmp/append-example-20241103-095132/ms/RNA/X/data\n",
+ "Wrote file:///tmp/append-example-20241103-095132\n",
+ "Wrote /tmp/append-example-20241103-095132/obs\n",
+ "Wrote /tmp/append-example-20241103-095132/ms/RNA/var\n",
+ "Writing /tmp/append-example-20241103-095132/ms/RNA/X/data\n",
+ "Wrote /tmp/append-example-20241103-095132/ms/RNA/X/data\n",
+ "Wrote file:///tmp/append-example-20241103-095132\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Finally, we'll demonstrate combining multiple AnnDatas into one new experiment.\n",
- "\n",
- "The flow is pretty similar to the above:\n",
- "\n",
- "1. One call to `register_anndatas` or `register_h5ads` (passing all input AnnDatas/h5ads)\n",
- "2. One call to `from_anndata`/`from_h5ad` *for each input AnnData*\n",
- "\n",
- "Here's a helper function to simulate multiple lab runs. As above, where we used `pbmc3k` to simulate Monday and Tuesday data, here we use `pbmc3k` to simulate multiple AnnData objects."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {
- "tags": []
- },
- "outputs": [],
- "source": [
- "def make_ad(when, scale, obs_id_suffix):\n",
- " ad = ad1.copy()\n",
- " ad.obs.index = [e.replace(\"-1\", obs_id_suffix) for e in ad.obs.index]\n",
- " ad.obs[\"when\"] = [when] * len(ad.obs)\n",
- " ad.X *= scale\n",
- " return ad\n",
- "\n",
- "ads = [\n",
- " make_ad(when, scale, f\"-{idx + 3}\")\n",
- " for idx, (when, scale)\n",
- " in enumerate({\n",
- " \"Wednesday\": 20,\n",
- " \"Thursday\": 30,\n",
- " \"Friday\": 40,\n",
- " }.items())\n",
- "]"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "We'll ingest these AnnData objects, as before, but this time to a fresh/empty `/tmp` location:"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839/obs\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'/tmp/append-example-20240521-145839'"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n",
- "experiment_uri = f\"/tmp/append-example-{stamp}\"\n",
- "experiment_uri"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Here we'll register all the AnnData objects. Note that the SOMA Experiment doesn't exist yet, so we pass `experiment_uri=None` to signify that."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: registering AnnData object.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: accumulated to nobs=2700 nvar=32738.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: registering AnnData object.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: accumulated to nobs=5400 nvar=32738.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: registering AnnData object.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: accumulated to nobs=8100 nvar=32738.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Registration: complete.\n"
- ]
- }
- ],
- "source": [
- "rd2 = tiledbsoma.io.register_anndatas(\n",
- " experiment_uri=None, # new Experiment, from scratch\n",
- " adatas=ads,\n",
- " measurement_name=measurement_name,\n",
- " obs_field_name=\"obs_id\",\n",
- " var_field_name=\"var_id\",\n",
- ")"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Now that we've gotten the registrations for all the input AnnData objects, we can ingest them.\n",
- "\n",
- "Note:\n",
- "\n",
- "- Here we ingest them sequentially, in the same order as above.\n",
- "- But we could also ingest them in any shuffled order.\n",
- "- Or we could have multiple workers in ingest them in parallel, one worker per AnnData object, as long as the registration data are passed to each worker."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/obs\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/obs\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/obs\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Wrote /tmp/append-example-20240521-145839\n"
- ]
- }
- ],
- "source": [
- "for ad in ads:\n",
- " tiledbsoma.io.from_anndata(\n",
- " experiment_uri,\n",
- " ad,\n",
- " measurement_name=measurement_name,\n",
- " registration_mapping=rd2,\n",
- " )"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839/obs\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Reading back the concatenated data, we see 2700 rows for each of {`-3`, `-4`, `-5`}:"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839/ms/RNA/var\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " obs_id n_genes_by_counts when\n",
- "0 AAACATACAACCAC-3 781 Wednesday\n",
- "1 AAACATTGAGCTAC-3 1352 Wednesday\n",
- "2 AAACATTGATCAGC-3 1131 Wednesday\n",
- "3 AAACCGTGCTTCCG-3 960 Wednesday\n",
- "4 AAACCGTGTATGCG-3 522 Wednesday\n",
- "... ... ... ...\n",
- "8095 TTTCGAACTCTCAT-5 1155 Friday\n",
- "8096 TTTCTACTGAGGCA-5 1227 Friday\n",
- "8097 TTTCTACTTCCTCG-5 622 Friday\n",
- "8098 TTTGCATGAGAGGC-5 454 Friday\n",
- "8099 TTTGCATGCCTCAC-5 724 Friday\n",
- "\n",
- "[8100 rows x 3 columns]\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " print(\n",
- " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
- " .concat()\n",
- " .to_pandas()\n",
- " )"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Writing /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "`var` is the same as in the single original Anndata objects (since we added more cells with all the same genes):"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839/ms/RNA/X/data\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " soma_joinid var_id gene_ids\n",
- "0 0 MIR1302-10 ENSG00000243485\n",
- "1 1 FAM138A ENSG00000237613\n",
- "2 2 OR4F5 ENSG00000186092\n",
- "3 3 RP11-34P13.7 ENSG00000238009\n",
- "4 4 RP11-34P13.8 ENSG00000239945\n",
- "... ... ... ...\n",
- "32733 32733 AC145205.1 ENSG00000215635\n",
- "32734 32734 BAGE5 ENSG00000268590\n",
- "32735 32735 CU459201.1 ENSG00000251180\n",
- "32736 32736 AC002321.2 ENSG00000215616\n",
- "32737 32737 AC002321.1 ENSG00000215611\n",
- "\n",
- "[32738 rows x 3 columns]\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " print(\n",
- " exp.ms[\"RNA\"]\n",
- " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
- " .concat()\n",
- " .to_pandas()\n",
- " )"
- ]
- },
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Wrote /tmp/append-example-20240521-145839\n"
+ ]
+ }
+ ],
+ "source": [
+ "for ad in ads:\n",
+ " tiledbsoma.io.from_anndata(\n",
+ " experiment_uri,\n",
+ " ad,\n",
+ " measurement_name=measurement_name,\n",
+ " registration_mapping=rd2,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e2e54f89",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Reading back the concatenated data, we see 2700 rows for each of {`-3`, `-4`, `-5`}:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "8f86fd3d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Finally, the `X` expression matrix contains 3x the entries as the original, but is also the same width (since we didn't introduce new genes):"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " obs_id n_genes_by_counts when\n",
+ "0 AAACATACAACCAC-3 781 Wednesday\n",
+ "1 AAACATTGAGCTAC-3 1352 Wednesday\n",
+ "2 AAACATTGATCAGC-3 1131 Wednesday\n",
+ "3 AAACCGTGCTTCCG-3 960 Wednesday\n",
+ "4 AAACCGTGTATGCG-3 522 Wednesday\n",
+ "... ... ... ...\n",
+ "2695 TTTCGAACTCTCAT-3 1155 Wednesday\n",
+ "2696 TTTCTACTGAGGCA-3 1227 Wednesday\n",
+ "2697 TTTCTACTTCCTCG-3 622 Wednesday\n",
+ "2698 TTTGCATGAGAGGC-3 454 Wednesday\n",
+ "2699 TTTGCATGCCTCAC-3 724 Wednesday\n",
+ "\n",
+ "[2700 rows x 3 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " print(\n",
+ " exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
+ " .concat()\n",
+ " .to_pandas()\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f8596b0",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "`var` is the same as in each of the original AnnData objects (since we added more cells with all the same genes):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "bffce533",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " soma_dim_0 soma_dim_1 soma_data\n",
- "0 0 70 20.0\n",
- "1 0 166 20.0\n",
- "2 0 178 40.0\n",
- "3 0 326 20.0\n",
- "4 0 363 20.0\n",
- "... ... ... ...\n",
- "6860647 8099 32697 40.0\n",
- "6860648 8099 32698 280.0\n",
- "6860649 8099 32702 40.0\n",
- "6860650 8099 32705 40.0\n",
- "6860651 8099 32708 120.0\n",
- "\n",
- "[6860652 rows x 3 columns]\n",
- "\n",
- "((0, 8099), (0, 32732))\n"
- ]
- }
- ],
- "source": [
- "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
- " X = exp.ms[\"RNA\"].X[\"data\"]\n",
- " print(X.read().tables().concat().to_pandas())\n",
- " print()\n",
- " print(X.used_shape())"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " soma_joinid var_id gene_ids\n",
+ "0 0 MIR1302-10 ENSG00000243485\n",
+ "1 1 FAM138A ENSG00000237613\n",
+ "2 2 OR4F5 ENSG00000186092\n",
+ "3 3 RP11-34P13.7 ENSG00000238009\n",
+ "4 4 RP11-34P13.8 ENSG00000239945\n",
+ "... ... ... ...\n",
+ "32733 32733 AC145205.1 ENSG00000215635\n",
+ "32734 32734 BAGE5 ENSG00000268590\n",
+ "32735 32735 CU459201.1 ENSG00000251180\n",
+ "32736 32736 AC002321.2 ENSG00000215616\n",
+ "32737 32737 AC002321.1 ENSG00000215611\n",
+ "\n",
+ "[32738 rows x 3 columns]\n"
+ ]
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.8"
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " print(\n",
+ " exp.ms[\"RNA\"]\n",
+ " .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
+ " .concat()\n",
+ " .to_pandas()\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9a9737a0",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Finally, the `X` expression matrix contains 3x as many entries as the original, but is also the same width (since we didn't introduce new genes):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "05cf63a0",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " soma_dim_0 soma_dim_1 soma_data\n",
+ "0 0 70 20.0\n",
+ "1 0 166 20.0\n",
+ "2 0 178 40.0\n",
+ "3 0 326 20.0\n",
+ "4 0 363 20.0\n",
+ "... ... ... ...\n",
+ "6860647 8099 32697 40.0\n",
+ "6860648 8099 32698 280.0\n",
+ "6860649 8099 32702 40.0\n",
+ "6860650 8099 32705 40.0\n",
+ "6860651 8099 32708 120.0\n",
+ "\n",
+ "[6860652 rows x 3 columns]\n",
+ "\n",
+ "((0, 8099), (0, 32732))\n"
+ ]
}
+ ],
+ "source": [
+ "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+ " X = exp.ms[\"RNA\"].X[\"data\"]\n",
+ " print(X.read().tables().concat().to_pandas())\n",
+ " print()\n",
+ " print(X.used_shape())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99e89175-335b-4ef8-95ec-290a702ae10d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 5
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
}
diff --git a/apis/python/notebooks/tutorial_soma_objects.ipynb b/apis/python/notebooks/tutorial_soma_objects.ipynb
index 9071d7c677..a4bb0215b3 100644
--- a/apis/python/notebooks/tutorial_soma_objects.ipynb
+++ b/apis/python/notebooks/tutorial_soma_objects.ipynb
@@ -52,19 +52,19 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 2,
"id": "c3af1793-e2be-45e1-8128-bb64536673f7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 5,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -86,7 +86,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 3,
"id": "228e9411-434e-4c55-8fb4-fef3216dca08",
"metadata": {
"tags": []
@@ -95,12 +95,12 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 6,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -111,7 +111,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 4,
"id": "5d92e331-5c6c-4971-b956-442996d5efa9",
"metadata": {
"tags": []
@@ -120,10 +120,10 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 7,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -178,7 +178,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 5,
"id": "66b67624-3cbe-4401-a297-e008cf18ab0b",
"metadata": {
"tags": []
@@ -187,15 +187,15 @@
{
"data": {
"text/plain": [
- "soma_joinid: int64\n",
- "obs_id: large_string\n",
- "n_genes: int64\n",
- "percent_mito: float\n",
- "n_counts: float\n",
- "louvain: large_string"
+ "soma_joinid: int64 not null\n",
+ "obs_id: large_string not null\n",
+ "n_genes: int64 not null\n",
+ "percent_mito: float not null\n",
+ "n_counts: float not null\n",
+ "louvain: large_string not null"
]
},
- "execution_count": 8,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -231,7 +231,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 6,
"id": "26676c4f-dfb8-4f48-9bc5-1a66ee085f9e",
"metadata": {
"tags": []
@@ -401,7 +401,7 @@
"[2638 rows x 6 columns]"
]
},
- "execution_count": 9,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -432,7 +432,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 7,
"id": "32bfed6c-b0b7-41ed-986c-df7d462498c4",
"metadata": {
"tags": []
@@ -599,7 +599,7 @@
"10 B cells "
]
},
- "execution_count": 10,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -620,7 +620,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 8,
"id": "703fe8ad-7123-4311-a58b-b00a27c7a483",
"metadata": {
"tags": []
@@ -726,7 +726,7 @@
"10 AAACTTGAAAAACG-1 1116"
]
},
- "execution_count": 11,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -747,7 +747,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 9,
"id": "a5ef3a97-abc3-4d80-ab48-1898fa64d566",
"metadata": {
"tags": []
@@ -917,7 +917,7 @@
"[75 rows x 6 columns]"
]
},
- "execution_count": 12,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -948,7 +948,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 10,
"id": "d437b606-8338-4220-966d-59c4bf48fd13",
"metadata": {
"tags": []
@@ -957,12 +957,12 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 13,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -983,7 +983,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 11,
"id": "0574abf8-5f72-4a05-a90f-608fdda2db07",
"metadata": {
"tags": []
@@ -992,12 +992,12 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 14,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -1038,7 +1038,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 12,
"id": "c8c2aa17-52d7-4bd5-a5f3-b58c18fdcb11",
"metadata": {
"tags": []
@@ -1047,11 +1047,11 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 23,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1071,7 +1071,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 13,
"id": "75035918-b26f-48b2-a47b-8ea08c308e37",
"metadata": {
"tags": []
@@ -1080,10 +1080,10 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 26,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -1105,7 +1105,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 14,
"id": "b12b2c0e-db32-48f1-b114-f867baf5be76",
"metadata": {
"tags": []
@@ -1114,12 +1114,12 @@
{
"data": {
"text/plain": [
- "soma_dim_0: int64\n",
- "soma_dim_1: int64\n",
- "soma_data: float"
+ "soma_dim_0: int64 not null\n",
+ "soma_dim_1: int64 not null\n",
+ "soma_data: float not null"
]
},
- "execution_count": 28,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -1148,7 +1148,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 15,
"id": "27858aed-aa89-4c45-bccc-38e9cfa5cbb2",
"metadata": {
"tags": []
@@ -1160,7 +1160,7 @@
"(2638, 1838)"
]
},
- "execution_count": 34,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -1183,7 +1183,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 16,
"id": "2c586c5c-055b-4bc7-9995-851dd802d961",
"metadata": {
"tags": []
@@ -1198,7 +1198,7 @@
"strides: (7352, 4)"
]
},
- "execution_count": 35,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1219,7 +1219,7 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 17,
"id": "6d90e592-7a67-4a41-af08-05aa3807167a",
"metadata": {
"tags": []
@@ -1243,7 +1243,7 @@
" -0.13032717, -0.4713379 ]], dtype=float32)"
]
},
- "execution_count": 38,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -1266,7 +1266,7 @@
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 18,
"id": "4a3b1f45-017d-4b92-9f2e-c88e8e3aa234",
"metadata": {
"tags": []
@@ -1290,7 +1290,7 @@
" -0.16255915, -0.50339466]], dtype=float32)"
]
},
- "execution_count": 48,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -1302,7 +1302,7 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 19,
"id": "007f1e15-61cd-40b8-bb23-4102662ab3af",
"metadata": {
"tags": []
@@ -1314,7 +1314,7 @@
"(10, 1838)"
]
},
- "execution_count": 49,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -1335,7 +1335,7 @@
},
{
"cell_type": "code",
- "execution_count": 104,
+ "execution_count": 20,
"id": "d6d39b44-33b3-4cb7-8a34-d30b94899ad1",
"metadata": {
"tags": []
@@ -1353,7 +1353,7 @@
" [-0.10383061]], dtype=float32)"
]
},
- "execution_count": 104,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -1407,7 +1407,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 21,
"id": "71f8fed4-4ffd-4f30-a5b0-4e3a4a3730f3",
"metadata": {
"tags": []
@@ -1416,10 +1416,10 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 16,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -1440,7 +1440,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 22,
"id": "41897a5c-2225-49f9-b9f2-3a68a6ad8079",
"metadata": {
"tags": []
@@ -1449,12 +1449,12 @@
{
"data": {
"text/plain": [
- "soma_dim_0: int64\n",
- "soma_dim_1: int64\n",
- "soma_data: float"
+ "soma_dim_0: int64 not null\n",
+ "soma_dim_1: int64 not null\n",
+ "soma_data: float not null"
]
},
- "execution_count": 17,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -1475,7 +1475,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 23,
"id": "42c1d852-6492-4a5e-b1fe-bc9af3f83639",
"metadata": {
"tags": []
@@ -1484,10 +1484,10 @@
{
"data": {
"text/plain": [
- "(9223372036854773760, 9223372036854773760)"
+ "(2147483646, 2147483646)"
]
},
- "execution_count": 43,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@@ -1518,7 +1518,7 @@
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": 24,
"id": "2862f737-4f08-4886-9496-fe7771b4a581",
"metadata": {
"tags": []
@@ -1530,7 +1530,7 @@
"4848644"
]
},
- "execution_count": 56,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -1551,7 +1551,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 25,
"id": "eaa0f9aa-8167-4f26-a52f-4d9636dde37b",
"metadata": {
"tags": []
@@ -1560,10 +1560,10 @@
{
"data": {
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 44,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -1582,7 +1582,7 @@
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": 26,
"id": "00a7899f-2d28-4f07-b438-ab4d4d6bcfe5",
"metadata": {
"tags": []
@@ -1721,7 +1721,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.11.9"
}
},
"nbformat": 4,
diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py
index b89f25e666..9f18d7d8e9 100644
--- a/apis/python/src/tiledbsoma/_dense_nd_array.py
+++ b/apis/python/src/tiledbsoma/_dense_nd_array.py
@@ -22,7 +22,7 @@
from ._exception import SOMAError, map_exception_for_create
from ._flags import DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN, NEW_SHAPE_FEATURE_FLAG_ENABLED
from ._tdb_handles import DenseNDArrayWrapper
-from ._types import OpenTimestamp, Slice
+from ._types import OpenTimestamp, Slice, StatusAndReason
from ._util import dense_indices_to_shape
from .options._soma_tiledb_context import (
SOMATileDBContext,
@@ -361,6 +361,22 @@ def resize(self, newshape: Sequence[Union[int, None]]) -> None:
else:
raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0")
+ def tiledbsoma_upgrade_shape(
+ self, newshape: Sequence[Union[int, None]], check_only: bool = False
+ ) -> StatusAndReason:
+ """Allows the array to have a resizeable shape as described in the TileDB-SOMA
+ 1.15 release notes. Raises an error if the new shape exceeds maxshape in
+ any dimension. Raises an error if the array already has a shape.
+
+ If ``check_only`` is true, only the handle's ``tiledbsoma_can_upgrade_shape``
+ is called and its result is returned; ``tiledbsoma_upgrade_shape`` is not
+ called. Otherwise the handle's ``tiledbsoma_upgrade_shape`` is called and
+ ``(True, "")`` is returned.
+
+ Raises ``NotImplementedError`` unless both ``NEW_SHAPE_FEATURE_FLAG_ENABLED``
+ and ``DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN`` are true.
+ """
+ if NEW_SHAPE_FEATURE_FLAG_ENABLED and DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN:
+ if check_only:
+ # Check-only path: ask the handle whether the upgrade is possible.
+ return self._handle.tiledbsoma_can_upgrade_shape(newshape)
+ else:
+ self._handle.tiledbsoma_upgrade_shape(newshape)
+ # The call above returned without raising, so report success.
+ return (True, "")
+ else:
+ raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0")
+
@classmethod
def _dim_capacity_and_extent(
cls,
diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py
index 915cfb67a2..9e160c79b0 100644
--- a/apis/python/src/tiledbsoma/_tdb_handles.py
+++ b/apis/python/src/tiledbsoma/_tdb_handles.py
@@ -658,6 +658,24 @@ def tiledbsoma_can_resize(
else:
raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0")
+ def tiledbsoma_upgrade_shape(self, newshape: Sequence[Union[int, None]]) -> None:
+ """Wrapper-class internals: forwards ``newshape`` to the wrapped handle's
+ ``tiledbsoma_upgrade_shape``.
+
+ Raises ``NotImplementedError`` when ``DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN``
+ is false.
+ """
+ if DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN:
+ self._handle.tiledbsoma_upgrade_shape(newshape)
+ else:
+ raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0")
+
+ def tiledbsoma_can_upgrade_shape(
+ self, newshape: Sequence[Union[int, None]]
+ ) -> StatusAndReason:
+ """Wrapper-class internals: returns the wrapped handle's
+ ``tiledbsoma_can_upgrade_shape(newshape)`` result as a ``StatusAndReason``.
+
+ Raises ``NotImplementedError`` when ``DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN``
+ is false.
+ """
+ if DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN:
+ # NOTE(review): ``cast`` is presumably ``typing.cast`` (type-checker hint
+ # only, no runtime conversion) -- confirm against this file's imports.
+ return cast(
+ StatusAndReason, self._handle.tiledbsoma_can_upgrade_shape(newshape)
+ )
+ else:
+ raise NotImplementedError("Not implemented for libtiledbsoma < 2.27.0")
+
class SparseNDArrayWrapper(SOMAArrayWrapper[clib.SOMASparseNDArray]):
"""Wrapper around a Pybind11 SparseNDArrayWrapper handle."""
diff --git a/apis/python/src/tiledbsoma/io/shaping.py b/apis/python/src/tiledbsoma/io/shaping.py
index ddd5e30eac..de847f532f 100644
--- a/apis/python/src/tiledbsoma/io/shaping.py
+++ b/apis/python/src/tiledbsoma/io/shaping.py
@@ -33,6 +33,18 @@ class SizingArgs(TypedDict):
output_handle: Printable
+def _find_old_sparse_ndarray_bounds(
+ snda: tiledbsoma.SparseNDArray,
+) -> Tuple[Tuple[int, int], ...]:
+ """Return the bounds of a sparse array, for arrays without the new shape.
+
+ Tries ``snda.used_shape()`` first; if that raises ``tiledbsoma.SOMAError``,
+ returns ``snda.non_empty_domain()`` instead. Per the annotation, the result
+ is a tuple of ``(int, int)`` pairs.
+ """
+ # New arrays (created by tiledbsoma 1.15 and above) will have the new shape.
+ # Older will have used_shape ...
+ # ... except _really_ old won't even have that.
+ try:
+ return snda.used_shape()
+ except tiledbsoma.SOMAError:
+ # No used_shape available: fall back to the non-empty domain.
+ return snda.non_empty_domain()
+
+
def show_experiment_shapes(
uri: str,
*,
@@ -257,7 +269,8 @@ def _leaf_visitor_show_shapes(
elif isinstance(item, tiledbsoma.SparseNDArray):
_print_leaf_node_banner("SparseNDArray", node_name, item.uri, args)
- _bannerize(args, "used_shape", item.used_shape())
+ # used_shape() can raise SOMAError on very old arrays, so use the fallback helper.
+ _bannerize(args, "used_shape", _find_old_sparse_ndarray_bounds(item))
_bannerize(args, "shape", item.shape)
_bannerize(args, "maxshape", item.maxshape)
_bannerize(args, "upgraded", item.tiledbsoma_has_upgraded_shape)
@@ -306,7 +319,8 @@ def _leaf_visitor_upgrade(
print(" Already upgraded", file=args["output_handle"])
elif isinstance(item, tiledbsoma.SparseNDArray):
- used_shape = item.used_shape()
+ # used_shape() can raise SOMAError on very old arrays, so use the fallback helper.
+ used_shape = _find_old_sparse_ndarray_bounds(item)
new_shape = tuple(e[1] + 1 for e in used_shape)
_print_leaf_node_banner("SparseNDArray", node_name, item.uri, args)
diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py
index 4777a4d6b0..13362da3f1 100644
--- a/apis/python/tests/test_shape.py
+++ b/apis/python/tests/test_shape.py
@@ -228,6 +228,18 @@ def test_dense_nd_array_basics(tmp_path):
else:
assert dnda.shape == (100, 200)
+ if (
+ tiledbsoma._flags.DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN
+ and tiledbsoma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED
+ ):
+ with tiledbsoma.DenseNDArray.open(uri) as dnda:
+ ok, msg = dnda.tiledbsoma_upgrade_shape((600, 700), check_only=True)
+ assert not ok
+ assert (
+ msg
+ == "tiledbsoma_can_upgrade_shape: array already has a shape: please use resize"
+ )
+
@pytest.mark.parametrize(
"soma_joinid_domain",