Skip to content

Commit

Permalink
✨Iterate on perturbation use case (#156)
Browse files Browse the repository at this point in the history
Signed-off-by: zethson <[email protected]>
  • Loading branch information
Zethson authored Sep 26, 2024
1 parent d56aada commit 08fae33
Showing 1 changed file with 27 additions and 90 deletions.
117 changes: 27 additions & 90 deletions docs/perturbation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In this guide we demonstrate how to curate a complex, real world perturbation dataset [McFarland et al. 2020](https://www.nature.com/articles/s41467-020-17440-w) using the {mod}`wetlab` schema.\n",
"To speed up processing, we load a downsampled version (1000 cells) of the original dataset."
"This guide demonstrates how to curate a complex, real world perturbation dataset [McFarland et al. 2020](https://www.nature.com/articles/s41467-020-17440-w) using the {mod}`wetlab` schema."
]
},
{
Expand All @@ -35,15 +34,14 @@
},
"outputs": [],
"source": [
"from lamin_usecases import datasets as ds\n",
"import lamindb as ln\n",
"import bionty as bt\n",
"import wetlab as wl\n",
"import pandas as pd\n",
"\n",
"pd.set_option(\"display.max_columns\", None)\n",
"\n",
"ln.context.uid = \"K6sInKIQW5nt0000\"\n",
"ln.context.uid = \"K6sInKIQW5nt0002\"\n",
"ln.context.track()"
]
},
Expand All @@ -57,10 +55,27 @@
},
"outputs": [],
"source": [
"adata = ds.anndata_mcfarland()\n",
"# See https://lamin.ai/laminlabs/lamindata/transform/13VINnFk89PE0004 to learn how this dataset was prepared\n",
"adata = ln.Artifact.using(\"laminlabs/lamindata\").get(uid=\"Xk7Qaik9vBLV4PKf0000\").load()\n",
"adata.obs.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"curate = ln.Curator.from_anndata(\n",
" adata,\n",
" var_index=bt.Gene.ensembl_gene_id,\n",
" organism=\"human\",\n",
" sources={\"var_index\": bt.Source.filter(entity=\"bionty.Gene\", version=\"release-112\", organism=\"human\").one()}\n",
")\n",
"\n",
"curate.validate()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -82,86 +97,6 @@
"## Curate non-perturbation metadata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To keep this guide focused on the perturbation metadata, we have already curated the most important non-perturbation metadata."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
":::{dropdown} How was the original dataset curated so far?\n",
"\n",
"```python\n",
"\n",
"import lamindb as ln\n",
"import bionty as bt\n",
"\n",
"categoricals = {\n",
" \"DepMap_ID\": bt.CellLine.ontology_id,\n",
" \"cell_line\": bt.CellLine.name,\n",
" \"disease\": bt.Disease.name,\n",
" \"organism\": bt.Organism.name,\n",
" \"perturbation_type\": ln.ULabel.name,\n",
" \"sex\": bt.Phenotype.name,\n",
" \"time\": ln.ULabel.name,\n",
" \"tissue_type\": ln.ULabel.name,\n",
"}\n",
"sources = {\n",
" \"DepMap_ID\": bt.Source.filter(name=\"depmap\").one(),\n",
" \"cell_line\": bt.Source.filter(name=\"depmap\").one(),\n",
"}\n",
"\n",
"curate = ln.Curator.from_anndata(\n",
" adata,\n",
" var_index=bt.Gene.ensembl_gene_id,\n",
" categoricals=categoricals,\n",
" organism=\"human\",\n",
" sources=sources,\n",
")\n",
"\n",
"curate.validate()\n",
"\n",
"# Map mix of ensembl IDs and gene symbols in the var_index to ensembl IDs\n",
"gene_mapper = bt.Gene.standardize(\n",
" curate.non_validated[\"var_index\"],\n",
" field=\"symbol\",\n",
" return_field=\"ensembl_gene_id\",\n",
" return_mapper=True,\n",
" organism=\"human\",\n",
")\n",
"adata.var.index = adata.var.index.map(lambda x: gene_mapper.get(x, x))\n",
"\n",
"adata.obs[\"disease\"] = adata.obs[\"disease\"].cat.rename_categories(\n",
" {\n",
" \"colon/colorectal cancer\": \"colorectal cancer\",\n",
" \"rhabdoid\": \"rhabdoid tumor\",\n",
" \"bladder cancer\": \"urinary bladder carcinoma\",\n",
" \"endometrial/uterine cancer\": \"uterine corpus cancer\",\n",
" }\n",
")\n",
"\n",
"adata.obs[\"cell_line\"] = bt.CellLine.public(\n",
" source=bt.Source.filter(name=\"depmap\").one()\n",
").standardize(adata.obs[\"cell_line\"], field=\"name\")\n",
"bt.CellLine.public(source=bt.Source.filter(name=\"depmap\").one()).inspect(\n",
" adata.obs[\"cell_line\"], field=\"name\"\n",
")\n",
"\n",
"adata.obs[\"perturbation\"] = adata.obs[\"perturbation\"].cat.rename_categories(\n",
" lambda category: category.lower()\n",
")\n",
"adata.obs = adata.obs.rename(columns={\"DepMap_ID\": \"depmap_id\"})\n",
"\n",
"adata = adata[:, ~adata.var.index.isin(curate.non_validated[\"var_index\"])].copy()\n",
"```\n",
"\n",
":::"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -183,6 +118,7 @@
" \"tissue_type\": ln.ULabel.name,\n",
"}\n",
"sources = {\n",
" \"var_index\": bt.Source.filter(entity=\"bionty.Gene\", version=\"release-112\", organism=\"human\").one(),\n",
" \"depmap_id\": bt.Source.filter(name=\"depmap\").one(),\n",
" \"cell_line\": bt.Source.filter(name=\"depmap\").one(),\n",
"}\n",
Expand Down Expand Up @@ -340,11 +276,12 @@
"genetic_treatments = []\n",
"for name, symbol, target_name in treatments:\n",
" treatment = wl.GeneticTreatment(system=\"CRISPR KO\", name=name).save()\n",
" gene = (\n",
" bt.Gene.from_source(symbol=symbol, organism=organism).save()\n",
" if symbol != \"lacz\"\n",
" else bt.Gene(symbol=symbol, organism=organism).save()\n",
" )\n",
" if symbol != \"lacz\":\n",
" gene_result = bt.Gene.from_source(symbol=symbol, organism=organism)\n",
" gene = gene_result[0] if isinstance(gene_result, list) else gene_result\n",
" gene = gene.save()\n",
" else:\n",
" gene = bt.Gene(symbol=symbol, organism=organism).save()\n",
" target = wl.TreatmentTarget(name=target_name).save()\n",
" target.genes.add(gene)\n",
" treatment.targets.add(target)\n",
Expand Down

0 comments on commit 08fae33

Please sign in to comment.