✨Iterate on perturbation use case (#156)

Signed-off-by: zethson <[email protected]>
laminlabs · Sep 26, 2024 · 08fae33 · 08fae33
1 parent d56aada
commit 08fae33
Showing 1 changed file with 27 additions and 90 deletions.
diff --git a/docs/perturbation.ipynb b/docs/perturbation.ipynb
@@ -11,8 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In this guide we demonstrate how to curate a complex, real world perturbation dataset [McFarland et al. 2020](https://www.nature.com/articles/s41467-020-17440-w) using the {mod}`wetlab` schema.\n",
-    "To speed up processing, we load a downsampled version (1000 cells) of the original dataset."
+    "This guide demonstrates how to curate a complex, real-world perturbation dataset [McFarland et al. 2020](https://www.nature.com/articles/s41467-020-17440-w) using the {mod}`wetlab` schema."
    ]
   },
   {
@@ -35,15 +34,14 @@
    },
    "outputs": [],
    "source": [
-    "from lamin_usecases import datasets as ds\n",
     "import lamindb as ln\n",
     "import bionty as bt\n",
     "import wetlab as wl\n",
     "import pandas as pd\n",
     "\n",
     "pd.set_option(\"display.max_columns\", None)\n",
     "\n",
-    "ln.context.uid = \"K6sInKIQW5nt0000\"\n",
+    "ln.context.uid = \"K6sInKIQW5nt0002\"\n",
     "ln.context.track()"
    ]
   },
@@ -57,10 +55,27 @@
    },
    "outputs": [],
    "source": [
-    "adata = ds.anndata_mcfarland()\n",
+    "# See https://lamin.ai/laminlabs/lamindata/transform/13VINnFk89PE0004 to learn how this dataset was prepared\n",
+    "adata = ln.Artifact.using(\"laminlabs/lamindata\").get(uid=\"Xk7Qaik9vBLV4PKf0000\").load()\n",
     "adata.obs.head(3)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "curate = ln.Curator.from_anndata(\n",
+    "    adata,\n",
+    "    var_index=bt.Gene.ensembl_gene_id,\n",
+    "    organism=\"human\",\n",
+    "    sources={\"var_index\": bt.Source.filter(entity=\"bionty.Gene\", version=\"release-112\", organism=\"human\").one()}\n",
+    ")\n",
+    "\n",
+    "curate.validate()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -82,86 +97,6 @@
     "## Curate non-perturbation metadata"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To keep this guide focused on the perturbation metadata, we have already curated the most important non-perturbation metadata."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    ":::{dropdown} How was the original dataset curated so far?\n",
-    "\n",
-    "```python\n",
-    "\n",
-    "import lamindb as ln\n",
-    "import bionty as bt\n",
-    "\n",
-    "categoricals = {\n",
-    "    \"DepMap_ID\": bt.CellLine.ontology_id,\n",
-    "    \"cell_line\": bt.CellLine.name,\n",
-    "    \"disease\": bt.Disease.name,\n",
-    "    \"organism\": bt.Organism.name,\n",
-    "    \"perturbation_type\": ln.ULabel.name,\n",
-    "    \"sex\": bt.Phenotype.name,\n",
-    "    \"time\": ln.ULabel.name,\n",
-    "    \"tissue_type\": ln.ULabel.name,\n",
-    "}\n",
-    "sources = {\n",
-    "    \"DepMap_ID\": bt.Source.filter(name=\"depmap\").one(),\n",
-    "    \"cell_line\": bt.Source.filter(name=\"depmap\").one(),\n",
-    "}\n",
-    "\n",
-    "curate = ln.Curator.from_anndata(\n",
-    "    adata,\n",
-    "    var_index=bt.Gene.ensembl_gene_id,\n",
-    "    categoricals=categoricals,\n",
-    "    organism=\"human\",\n",
-    "    sources=sources,\n",
-    ")\n",
-    "\n",
-    "curate.validate()\n",
-    "\n",
-    "# Map mix of ensembl IDs and gene symbols in the var_index to ensembl IDs\n",
-    "gene_mapper = bt.Gene.standardize(\n",
-    "    curate.non_validated[\"var_index\"],\n",
-    "    field=\"symbol\",\n",
-    "    return_field=\"ensembl_gene_id\",\n",
-    "    return_mapper=True,\n",
-    "    organism=\"human\",\n",
-    ")\n",
-    "adata.var.index = adata.var.index.map(lambda x: gene_mapper.get(x, x))\n",
-    "\n",
-    "adata.obs[\"disease\"] = adata.obs[\"disease\"].cat.rename_categories(\n",
-    "    {\n",
-    "        \"colon/colorectal cancer\": \"colorectal cancer\",\n",
-    "        \"rhabdoid\": \"rhabdoid tumor\",\n",
-    "        \"bladder cancer\": \"urinary bladder carcinoma\",\n",
-    "        \"endometrial/uterine cancer\": \"uterine corpus cancer\",\n",
-    "    }\n",
-    ")\n",
-    "\n",
-    "adata.obs[\"cell_line\"] = bt.CellLine.public(\n",
-    "    source=bt.Source.filter(name=\"depmap\").one()\n",
-    ").standardize(adata.obs[\"cell_line\"], field=\"name\")\n",
-    "bt.CellLine.public(source=bt.Source.filter(name=\"depmap\").one()).inspect(\n",
-    "    adata.obs[\"cell_line\"], field=\"name\"\n",
-    ")\n",
-    "\n",
-    "adata.obs[\"perturbation\"] = adata.obs[\"perturbation\"].cat.rename_categories(\n",
-    "    lambda category: category.lower()\n",
-    ")\n",
-    "adata.obs = adata.obs.rename(columns={\"DepMap_ID\": \"depmap_id\"})\n",
-    "\n",
-    "adata = adata[:, ~adata.var.index.isin(curate.non_validated[\"var_index\"])].copy()\n",
-    "```\n",
-    "\n",
-    ":::"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -183,6 +118,7 @@
     "    \"tissue_type\": ln.ULabel.name,\n",
     "}\n",
     "sources = {\n",
+    "    \"var_index\": bt.Source.filter(entity=\"bionty.Gene\", version=\"release-112\", organism=\"human\").one(),\n",
     "    \"depmap_id\": bt.Source.filter(name=\"depmap\").one(),\n",
     "    \"cell_line\": bt.Source.filter(name=\"depmap\").one(),\n",
     "}\n",
@@ -340,11 +276,12 @@
     "genetic_treatments = []\n",
     "for name, symbol, target_name in treatments:\n",
     "    treatment = wl.GeneticTreatment(system=\"CRISPR KO\", name=name).save()\n",
-    "    gene = (\n",
-    "        bt.Gene.from_source(symbol=symbol, organism=organism).save()\n",
-    "        if symbol != \"lacz\"\n",
-    "        else bt.Gene(symbol=symbol, organism=organism).save()\n",
-    "    )\n",
+    "    if symbol != \"lacz\":\n",
+    "        gene_result = bt.Gene.from_source(symbol=symbol, organism=organism)\n",
+    "        gene = gene_result[0] if isinstance(gene_result, list) else gene_result\n",
+    "        gene = gene.save()\n",
+    "    else:\n",
+    "        gene = bt.Gene(symbol=symbol, organism=organism).save()\n",
     "    target = wl.TreatmentTarget(name=target_name).save()\n",
     "    target.genes.add(gene)\n",
     "    treatment.targets.add(target)\n",