Skip to content

Commit

Permalink
Re-do of #157
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Jun 21, 2022
1 parent c9bb1f7 commit 3e05ce7
Show file tree
Hide file tree
Showing 13 changed files with 1,082 additions and 120 deletions.
8 changes: 8 additions & 0 deletions _quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,16 @@ website:
text: "Comparing AnnData and TileDB files"
- href: "apis/python/examples/inspecting-schema.md"
text: "Inspecting SOMA schemas"
- href: "apis/python/examples/soma-slice-query.md"
text: "SOMA slice query"
- href: "apis/python/examples/normalizing.md"
text: "Normalizing a collection"
- href: "apis/python/examples/soco-reconnaissance.md"
text: "SOMA-collection reconnaissance"
- href: "apis/python/examples/soco-slice-query.md"
text: "SOMA-collection slice query"
- href: "apis/python/examples/soco-batch-query.md"
text: "SOMA-collection batch query"

- section: "Python API"
contents:
Expand Down
47 changes: 0 additions & 47 deletions apis/python/examples/collection-counts.py

This file was deleted.

20 changes: 3 additions & 17 deletions apis/python/examples/ingesting-data-files.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,9 @@ tools/ingestor -o /mini-corpus/tiledb-data -n /mini-corpus/anndata/10x_pbmc68k_r
...
```

Note this can take several hours total. The benefit of using an optimized storage solution (with
admittedly non-negligible ingest time) is that all subsequent queries benefit from that optimized
storage. In particular, various cross-corpus data queries shown in these examples take just seconds
or minutes.

A key point is **write once, read from multiple tools** -- in particular, using `tiledbsc-py` (this
package) or [`tiledbsc-r`](https://github.com/TileDB-Inc/tiledbsc) you can read SOMAs in either
language, regardless of which language was used to store them. This lets you use
best-in-class/state-of-the-art analysis algorithms, whichever language they're implemented in.
Note this takes many hours. The benefit of using an optimized storage solution (with admittedly
non-negligible ingest time) is that all subsequent queries benefit from that optimized storage. In
particular, various cross-corpus data queries shown in these examples take just seconds or minutes.

## Populate a SOMA collection

Expand All @@ -36,14 +30,6 @@ populate-soco -o /mini-corpus/soco -a /mini-corpus/tiledb-data/*

Note this is quite quick.

As a keystroke-saver, use the `tools/ingestor` script's `--soco` option which will populate the SOMA
collection at ingest time, so you don't even have to run `populate-soco` as an afterstep.

```
tools/ingestor -o /mini-corpus/tiledb-data --soco -n /mini-corpus/anndata/0cfab2d4-1b79-444e-8cbe-2ca9671ca85e.h5ad
tools/ingestor -o /mini-corpus/tiledb-data --soco -n /mini-corpus/anndata/10x_pbmc68k_reduced.h5ad
```

## Names and URIs

Next let's start taking a look across the collection.
Expand Down
261 changes: 261 additions & 0 deletions apis/python/examples/pre-query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
#!/usr/bin/env python

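# Invoke this with, for example,
#
# pre-query.py ./soma-collection
#
# -- it prints which SOMAs in the collection contain the sought obs/var values.
# With no argument, the collection path defaults to "soma-collection".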

import os
import sys
from collections import Counter
from typing import Dict, List

import pandas
import tiledb
import tiledbsc as t

# ================================================================
def main():
    """Command-line entry point: report which SOMAs hold the sought obs/var values.

    Accepts at most one command-line argument, the path of the SOMA
    collection; when it is omitted, "soma-collection" is used. Any other
    argument count prints a usage message to stderr and exits with status 1.
    """
    argc = len(sys.argv)
    if argc not in (1, 2):
        print(f"{sys.argv[0]}: need just one soma-collection path.", file=sys.stderr)
        sys.exit(1)
    soco_path = sys.argv[1] if argc == 2 else "soma-collection"

    soco = t.SOMACollection(soco_path)

    # Other reports defined in this file, currently disabled. To run one,
    # call it here (optionally preceded by a banner like the one below):
    #   print_names_and_uris(soco)
    #   show_obs_names(soco)
    #   show_var_names(soco)
    #   show_somas_with_all_three(soco)
    #   show_obs_id_counts(soco)
    #   show_var_id_counts(soco)
    #   show_obs_column_unique_values(soco, "cell_type")
    #   show_var_column_unique_values(soco, "feature_name")
    #   show_obs_value_counts(soco, ["cell_type", "tissue"])
    #   show_var_value_counts(soco, ["feature_name"])

    # Banner: blank line, rule, title -- one print call, three output lines.
    print(
        "",
        "================================================================",
        "SHOW SOMAS HAVING",
        sep="\n",
    )
    sought_obs = {"cell_type": ["B cell", "T cell"], "tissue": ["blood", "lung"]}
    sought_var = {"feature_name": ["MT-CO3"]}
    show_somas_having(soco, sought_obs, sought_var)


# ----------------------------------------------------------------
def print_names_and_uris(soco: t.SOMACollection) -> None:
    """Print one line per SOMA: its name left-justified to 40 columns, then its URI."""
    for member in soco:
        # !s mirrors the str() conversion that "%s" performs.
        line = f"{member.name!s:<40} {member.uri!s}"
        print(line)


# ----------------------------------------------------------------
def show_obs_names(soco: t.SOMACollection) -> None:
    """For each SOMA in the collection, print its URI and then each key of its obs."""
    for member in soco:
        print(member.uri)
        obs_keys = member.obs.keys()
        for key in obs_keys:
            print(f" obs {key!s}")


# ----------------------------------------------------------------
def show_var_names(soco: t.SOMACollection) -> None:
    """For each SOMA in the collection, print its URI and then each key of its var."""
    for member in soco:
        print(member.uri)
        var_keys = member.var.keys()
        for key in var_keys:
            print(f" var {key!s}")


# ----------------------------------------------------------------
def show_somas_with_all_three(soco: t.SOMACollection) -> None:
    """Print the URI of every SOMA that has all three attributes of interest.

    A SOMA qualifies when its obs attribute names include both "cell_type"
    and "tissue" and its var attribute names include "feature_name".

    Args:
        soco: The SOMA collection to scan.
    """
    for soma in soco:
        # Fetch the obs attribute names once per SOMA rather than once per test.
        obs_attr_names = soma.obs.attr_names()
        if "cell_type" not in obs_attr_names or "tissue" not in obs_attr_names:
            continue
        # var is only consulted for SOMAs that already passed the obs check.
        if "feature_name" in soma.var.attr_names():
            print(soma.uri)


# ----------------------------------------------------------------
def show_obs_id_counts(soco: t.SOMACollection) -> None:
    """Print a two-column table of obs IDs and how often each one occurs.

    Every ID yielded by ``soma.obs.ids()`` is tallied across all SOMAs in the
    collection. Rows appear in first-seen order.

    Args:
        soco: The SOMA collection to scan.
    """
    counts = Counter(obs_id for soma in soco for obs_id in soma.obs.ids())
    # Materialise the dict views as lists so pandas receives plain sequences.
    df = pandas.DataFrame.from_dict(
        {"obs_id": list(counts.keys()), "counts": list(counts.values())}
    )
    print(df)


# ----------------------------------------------------------------
def show_var_id_counts(soco: t.SOMACollection) -> None:
    """Print a two-column table of var IDs and how often each one occurs.

    Every ID yielded by ``soma.var.ids()`` is tallied across all SOMAs in the
    collection. Rows appear in first-seen order.

    Args:
        soco: The SOMA collection to scan.
    """
    counts = Counter(var_id for soma in soco for var_id in soma.var.ids())
    # Materialise the dict views as lists so pandas receives plain sequences.
    df = pandas.DataFrame.from_dict(
        {"var_id": list(counts.keys()), "counts": list(counts.values())}
    )
    print(df)


# ----------------------------------------------------------------
def show_obs_column_unique_values(soco: t.SOMACollection, col_name: str) -> None:
    """Print, per SOMA, a blank line, its URI, and the unique values of one obs column.

    SOMAs whose obs keys do not include ``col_name`` get only the blank line
    and the URI.
    """
    for member in soco:
        print()
        print(member.uri)
        if col_name not in member.obs.keys():
            continue
        column = member.obs.df()[col_name]
        print(column.unique())


# ----------------------------------------------------------------
def show_var_column_unique_values(soco: t.SOMACollection, col_name: str) -> None:
    """Print, per SOMA, a blank line, its URI, and the unique values of one var column.

    SOMAs whose var keys do not include ``col_name`` get only the blank line
    and the URI.
    """
    for member in soco:
        print()
        print(member.uri)
        if col_name not in member.var.keys():
            continue
        column = member.var.df()[col_name]
        print(column.unique())


# ----------------------------------------------------------------
def show_obs_value_counts(soco: t.SOMACollection, obs_labels: List[str]) -> None:
    """For each obs label, print how many SOMAs contain each of its values.

    For every label the whole collection is scanned: a progress line
    ("... <soma name>") is printed per SOMA, SOMAs whose obs dataframe lacks
    the label are skipped, and each distinct value of the label counts once
    per SOMA. A rule line with the label follows, then one "<value> <count>"
    line per value in ascending count order.

    Args:
        soco: The SOMA collection to scan.
        obs_labels: Names of the obs columns to report on.
    """
    for obs_label in obs_labels:
        counts = Counter()

        for soma in soco:
            print("...", soma.name)
            obs = soma.obs.df()

            if obs_label not in obs:
                continue

            # Sorting the distinct values fixes the order in which new keys
            # enter the counter, which in turn decides how equal counts are
            # ordered by the stable sort below.
            counts.update(sorted(set(obs[obs_label])))

        print(
            "----------------------------------------------------------------",
            obs_label,
        )
        for value, count in sorted(counts.items(), key=lambda item: item[1]):
            print(value, count)


# ----------------------------------------------------------------
def show_var_value_counts(soco: t.SOMACollection, var_labels: List[str]) -> None:
    """For each var label, print how many SOMAs contain each of its values.

    For every label the whole collection is scanned: a progress line
    ("... <soma name>") is printed per SOMA, SOMAs whose var dataframe lacks
    the label are skipped, and each distinct value of the label counts once
    per SOMA. A rule line with the label follows, then one "<value> <count>"
    line per value in ascending count order.

    Args:
        soco: The SOMA collection to scan.
        var_labels: Names of the var columns to report on.
    """
    for var_label in var_labels:
        counts = Counter()

        for soma in soco:
            print("...", soma.name)
            var = soma.var.df()

            if var_label not in var:
                continue

            # Sorting the distinct values fixes the order in which new keys
            # enter the counter, which in turn decides how equal counts are
            # ordered by the stable sort below.
            counts.update(sorted(set(var[var_label])))

        print(
            "----------------------------------------------------------------",
            var_label,
        )
        for value, count in sorted(counts.items(), key=lambda item: item[1]):
            print(value, count)


# ----------------------------------------------------------------
def _show_found_values(df, labels_to_values, kind, missing_marker):
    """Print which sought values occur in each labelled column of ``df``.

    For a label that is not a column of ``df``, ``missing_marker`` is printed
    and the label is skipped. ``kind`` ("obs" or "var") is only used in the
    " found <kind> <value>" output lines.
    """
    for label, sought_values in labels_to_values.items():
        if label not in df:
            print(missing_marker)
            continue
        # Only membership is needed, so use a set: no sort (which fails on
        # columns mixing strings with missing values) and no linear scans.
        present_values = set(df[label])
        for sought_value in sought_values:
            if sought_value in present_values:
                print(" found " + kind, sought_value)


def show_somas_having(
    soco: t.SOMACollection,
    obs_labels_to_values: Dict[str, List],
    var_labels_to_values: Dict[str, List],
) -> None:
    """Report, per SOMA, which of the sought obs and var values it contains.

    For each SOMA the URI is printed, then a " found obs <value>" line for
    every sought value present in the corresponding obs column, then a
    " found var <value>" line for every sought value present in the
    corresponding var column. When a sought label is not a column of the
    SOMA's obs (or var) dataframe, the marker "out1" (or "out2") is printed
    instead and that label is skipped.

    Args:
        soco: The SOMA collection to scan.
        obs_labels_to_values: Maps obs column names to the values sought in them.
        var_labels_to_values: Maps var column names to the values sought in them.
    """
    for soma in soco:
        print(soma.uri)
        _show_found_values(soma.obs.df(), obs_labels_to_values, "obs", "out1")
        _show_found_values(soma.var.df(), var_labels_to_values, "var", "out2")


# ================================================================
# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 3e05ce7

Please sign in to comment.