Re-do of #157

single-cell-data · Jun 21, 2022 · 3e05ce7 · 3e05ce7
1 parent c9bb1f7
commit 3e05ce7
Show file tree

Hide file tree

Showing 13 changed files with 1,082 additions and 120 deletions.
diff --git a/_quarto.yml b/_quarto.yml
@@ -65,8 +65,16 @@ website:
                   text: "Comparing AnnData and TileDB files"
                 - href: "apis/python/examples/inspecting-schema.md"
                   text: "Inspecting SOMA schemas"
+                - href: "apis/python/examples/soma-slice-query.md"
+                  text: "SOMA slice query"
+                - href: "apis/python/examples/normalizing.md"
+                  text: "Normalizing a collection"
                 - href: "apis/python/examples/soco-reconnaissance.md"
                   text: "SOMA-collection reconnaissance"
+                - href: "apis/python/examples/soco-slice-query.md"
+                  text: "SOMA-collection slice query"
+                - href: "apis/python/examples/soco-batch-query.md"
+                  text: "SOMA-collection batch query"
 
             - section: "Python API"
               contents:

diff --git a/apis/python/examples/collection-counts.py b/apis/python/examples/collection-counts.py
diff --git a/apis/python/examples/ingesting-data-files.md b/apis/python/examples/ingesting-data-files.md
@@ -8,15 +8,9 @@ tools/ingestor -o /mini-corpus/tiledb-data -n /mini-corpus/anndata/10x_pbmc68k_r
 ...
 ```
 
-Note this can take several hours total. The benefit of using an optimized storage solution (with
-admittedly non-negligible ingest time) is that all subsequent queries benefit from that optimized
-storage. In particular, various cross-corpus data queries shown in these examples take just seconds
-or minutes.
-
-A key point is **write once, read from multiple tools** -- in particular, using `tiledbsc-py` (this
-package) or [`tiledbsc-r`](https://github.com/TileDB-Inc/tiledbsc) you can read SOMAs in either
-language, regardless of which language was used to store them. This lets you use
-best-in-class/state-of-the-art analysis algorithms, whichever language they're implemented in.
+Note this takes many hours. The benefit of using an optimized storage solution (with admittedly
+non-negligible ingest time) is that all subsequent queries benefit from that optimized storage. In
+particular, various cross-corpus data queries shown in these examples take just seconds or minutes.
 
 ## Populate a SOMA collection
 
@@ -36,14 +30,6 @@ populate-soco -o /mini-corpus/soco -a /mini-corpus/tiledb-data/*
 
 Note this is quite quick.
 
-As a keystroke-saver, use the `tools/ingestor` script's `--soco` option which will populate the SOMA
-collection at ingest time, so you don't even have to run `populate-soco` as an afterstep.
-
-```
-tools/ingestor -o /mini-corpus/tiledb-data --soco -n /mini-corpus/anndata/0cfab2d4-1b79-444e-8cbe-2ca9671ca85e.h5ad
-tools/ingestor -o /mini-corpus/tiledb-data --soco -n /mini-corpus/anndata/10x_pbmc68k_reduced.h5ad
-```
-
 ## Names and URIs
 
 Next let's start taking a look across the collection.

diff --git a/apis/python/examples/pre-query.py b/apis/python/examples/pre-query.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python
+
+# Invoke this with, for example,
+#
+#   peek-soco ./soma-collection
+#
+# -- then you can inspect the SOMACollection object
+
+import tiledb
+import tiledbsc as t
+
+import pandas
+
+import sys, os
+from typing import List, Dict
+
+# ================================================================
+def main():  # Entry point: open the SOMA collection named on the command line and run the report(s) below.
+    if len(sys.argv) == 1:  # no path argument: fall back to a default relative path
+        soco_path = "soma-collection"
+    elif len(sys.argv) == 2:  # exactly one argument: use it as the collection path
+        soco_path = sys.argv[1]
+    else:  # more than one argument: complain on stderr and exit non-zero
+        print(f"{sys.argv[0]}: need just one soma-collection path.", file=sys.stderr)
+        sys.exit(1)
+
+    soco = t.SOMACollection(soco_path)  # the reports commented out below are disabled; only the final one runs
+
+    #    print()
+    #    print("================================================================")
+    #    print("NAMES AND URIS")
+    #    print_names_and_uris(soco)
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("OBS NAMES")
+    #    show_obs_names(soco)
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("VAR NAMES")
+    #    show_var_names(soco)
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("SOMAS HAVING ALL THREE")
+    #    show_somas_with_all_three(soco)
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("OBS_ID COUNTS")
+    #    show_obs_id_counts(soco)
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("VAR_ID COUNTS")
+    #    show_var_id_counts(soco)
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("OBS UNIQUE VALUES FOR CELL_TYPE")
+    #    show_obs_column_unique_values(soco, "cell_type")
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("VAR UNIQUE VALUES FOR FEATURE_NAME")
+    #    show_var_column_unique_values(soco, "feature_name")
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("OBS VALUE COUNTS FOR CELL_TYPE AND TISSUE")
+    #    show_obs_value_counts(soco, ["cell_type", "tissue"])
+    #
+    #    print()
+    #    print("================================================================")
+    #    print("VAR VALUE COUNTS FOR FEATURE_NAME")
+    #    show_var_value_counts(soco, ["feature_name"])
+
+    print()
+    print("================================================================")
+    print("SHOW SOMAS HAVING")
+    show_somas_having(  # first dict: obs column -> sought values; second dict: var column -> sought values
+        soco,
+        {"cell_type": ["B cell", "T cell"], "tissue": ["blood", "lung"]},
+        {"feature_name": ["MT-CO3"]},
+    )
+
+
+# ----------------------------------------------------------------
+def print_names_and_uris(soco: t.SOMACollection) -> None:
+    for soma in soco:
+        print("%-40s %s" % (soma.name, soma.uri))
+
+
+# ----------------------------------------------------------------
+def show_obs_names(soco: t.SOMACollection) -> None:
+    for soma in soco:
+        print(soma.uri)
+        for attr_name in soma.obs.keys():
+            print("  obs", attr_name)
+
+
+# ----------------------------------------------------------------
+def show_var_names(soco: t.SOMACollection) -> None:
+    for soma in soco:
+        print(soma.uri)
+        for attr_name in soma.var.keys():
+            print("  var", attr_name)
+
+
+# ----------------------------------------------------------------
+def show_somas_with_all_three(soco: t.SOMACollection) -> None:
+    for soma in soco:
+        if "cell_type" in soma.obs.attr_names():
+            if "tissue" in soma.obs.attr_names():
+                if "feature_name" in soma.var.attr_names():
+                    print(soma.uri)
+
+
+# ----------------------------------------------------------------
+def show_obs_id_counts(soco: t.SOMACollection) -> None:
+    counts = {}
+    for soma in soco:
+        for oid in soma.obs.ids():
+            if oid in counts:
+                counts[oid] += 1
+            else:
+                counts[oid] = 1
+    df = pandas.DataFrame.from_dict(
+        {"obs_id": counts.keys(), "counts": counts.values()}
+    )
+    # print(df.head())
+    print(df)
+
+
+# ----------------------------------------------------------------
+def show_var_id_counts(soco: t.SOMACollection) -> None:
+    counts = {}
+    for soma in soco:
+        for oid in soma.var.ids():
+            if oid in counts:
+                counts[oid] += 1
+            else:
+                counts[oid] = 1
+    df = pandas.DataFrame.from_dict(
+        {"var_id": counts.keys(), "counts": counts.values()}
+    )
+    # print(df.head())
+    print(df)
+
+
+# ----------------------------------------------------------------
+def show_obs_column_unique_values(soco: t.SOMACollection, col_name: str) -> None:
+    for soma in soco:
+        print()
+        print(soma.uri)
+        if col_name in soma.obs.keys():
+            print(soma.obs.df()[col_name].unique())
+
+
+# ----------------------------------------------------------------
+def show_var_column_unique_values(soco: t.SOMACollection, col_name: str) -> None:
+    for soma in soco:
+        print()
+        print(soma.uri)
+        if col_name in soma.var.keys():
+            print(soma.var.df()[col_name].unique())
+
+
+# ----------------------------------------------------------------
+def show_obs_value_counts(soco: t.SOMACollection, obs_labels: List[str]) -> None:
+
+    for obs_label in obs_labels:
+        counts = {}
+
+        for soma in soco:
+            print("...", soma.name)
+            # print("\n".join(sorted(soma.obs.attr_names())))
+            obs = soma.obs.df()
+
+            if not obs_label in obs:
+                continue
+
+            obs_label_values = sorted(list(set(obs[obs_label])))
+            for obs_label_value in obs_label_values:
+                if obs_label_value in counts:
+                    counts[obs_label_value] += 1
+                else:
+                    counts[obs_label_value] = 1
+
+        print(
+            "----------------------------------------------------------------",
+            obs_label,
+        )
+        for k, v in dict(sorted(counts.items(), key=lambda item: item[1])).items():
+            print(k, v)
+
+
+# ----------------------------------------------------------------
+def show_var_value_counts(soco: t.SOMACollection, var_labels: List[str]) -> None:
+
+    for var_label in var_labels:
+        counts = {}
+
+        for soma in soco:
+            print("...", soma.name)
+            # print("\n".join(sorted(soma.var.attr_names())))
+            var = soma.var.df()
+
+            if not var_label in var:
+                continue
+
+            var_label_values = sorted(list(set(var[var_label])))
+            for var_label_value in var_label_values:
+                if var_label_value in counts:
+                    counts[var_label_value] += 1
+                else:
+                    counts[var_label_value] = 1
+
+        print(
+            "----------------------------------------------------------------",
+            var_label,
+        )
+        for k, v in dict(sorted(counts.items(), key=lambda item: item[1])).items():
+            print(k, v)
+
+
+# ----------------------------------------------------------------
+def show_somas_having(
+    soco: t.SOMACollection,
+    obs_labels_to_values: Dict[str, List],
+    var_labels_to_values: Dict[str, List],
+) -> None:
+
+    for soma in soco:
+        print(soma.uri)
+
+        obs = soma.obs.df()
+        for obs_label in obs_labels_to_values:
+            if not obs_label in obs:
+                print("out1")
+                continue
+            soma_obs_label_values = sorted(list(set(obs[obs_label])))
+            for sought_obs_label_value in obs_labels_to_values[obs_label]:
+                if sought_obs_label_value in soma_obs_label_values:
+                    print("  found obs", sought_obs_label_value)
+
+        var = soma.var.df()
+        for var_label in var_labels_to_values:
+            if not var_label in var:
+                print("out2")
+                continue
+            soma_var_label_values = sorted(list(set(var[var_label])))
+            for sought_var_label_value in var_labels_to_values[var_label]:
+                if sought_var_label_value in soma_var_label_values:
+                    print("  found var", sought_var_label_value)
+
+
+# ================================================================
+if __name__ == "__main__":
+    main()