Commit
matin authored and committed on Jul 24, 2024
1 parent 75ba304, commit aeb4073
Showing 3 changed files with 184 additions and 1,899 deletions.
@@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download the raw single-cell and multiome counts from the public OpenProblems S3 bucket\n",
"!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/sc_counts.h5ad ./resources_raw/ --no-sign-request\n",
"!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/multiome_counts.h5mu ./resources_raw/ --no-sign-request"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preprocess the single-cell counts (QC, cell and gene filtering, cell-type merging), then pseudobulk per plate, well and cell type."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install sctk anndata\n",
"\n",
"import anndata as ad\n",
"import pandas as pd\n",
"import numpy as np\n",
"import sctk\n",
"\n",
"par = {\n",
"    'sc_counts': '../resources_raw/sc_counts.h5ad',\n",
"    'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',\n",
"}\n",
"\n",
"def preprocess_sc(par):\n",
"    # clean up: keep only the obs columns needed downstream and reset X to raw counts\n",
"    sc_counts = ad.read_h5ad(par['sc_counts'])\n",
"    sc_counts.obs = sc_counts.obs[['well', 'row', 'col', 'plate_name', 'cell_type', 'sm_name', 'donor_id']]\n",
"    sc_counts.X = sc_counts.layers['counts']\n",
"    del sc_counts.layers\n",
"    del sc_counts.obsm\n",
"    sc_counts.var_names_make_unique()\n",
"    # pseudobulk grouping key: one group per plate, well and cell type\n",
"    sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_name'].astype('str') \\\n",
"        + '_' + sc_counts.obs['well'].astype('str') \\\n",
"        + '_' + sc_counts.obs['cell_type'].astype('str')\n",
"    sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_well_cell_type'].astype('category')\n",
"\n",
"    # merge T-cell subtypes into a single 'T cells' label\n",
"    CELL_TYPES = ['NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells', 'B cells', 'Myeloid cells']\n",
"    T_cell_types = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']\n",
"    cell_type_map = {cell_type: 'T cells' if cell_type in T_cell_types else cell_type for cell_type in CELL_TYPES}\n",
"    sc_counts.obs['cell_type'] = sc_counts.obs['cell_type'].map(cell_type_map)\n",
"\n",
"    # QC metrics\n",
"    sctk.calculate_qc(sc_counts)\n",
"    sctk.cellwise_qc(sc_counts)\n",
"\n",
"    # filtering\n",
"    # cell-wise: flag cells with a high hemoglobin fraction\n",
"    filter_percent_hb = sc_counts.obs.percent_hb > 0.2\n",
"    print('cells flagged by hemoglobin filter:', filter_percent_hb.sum())\n",
"\n",
"    # gene-wise\n",
"    # Step 1: List the plates\n",
"    plates = sc_counts.obs['plate_name'].unique()\n",
"\n",
"    # Step 2: Initialize a DataFrame to store counts\n",
"    gene_counts_per_plate = pd.DataFrame(index=sc_counts.var_names, columns=plates, dtype=int)\n",
"\n",
"    # Step 3: Iterate over each plate and calculate expression counts\n",
"    for plate in plates:\n",
"        # Subset the AnnData object for the current plate\n",
"        subset = sc_counts[sc_counts.obs['plate_name'] == plate]\n",
"\n",
"        # Count, per gene, the number of cells with non-zero expression\n",
"        expressed_genes = (subset.X > 0).sum(axis=0)\n",
"\n",
"        # Convert from sparse matrix format if necessary\n",
"        if isinstance(expressed_genes, np.matrix):\n",
"            expressed_genes = np.array(expressed_genes).flatten()\n",
"\n",
"        # Store the counts in the DataFrame\n",
"        gene_counts_per_plate[plate] = expressed_genes\n",
"\n",
"    # Step 4: Aggregate counts across plates\n",
"    # `max` keeps a gene if it meets the threshold in at least one plate\n",
"    max_counts = gene_counts_per_plate.max(axis=1)\n",
"\n",
"    # Step 5: Keep genes expressed in at least 100 cells in any plate\n",
"    genes_to_keep = max_counts >= 100\n",
"    print('retained genes:', genes_to_keep.sum())\n",
"\n",
"    # actual filtering; copy to materialise the subset instead of keeping a view\n",
"    sc_counts = sc_counts[~filter_percent_hb, genes_to_keep].copy()\n",
"\n",
"    # clean: keep the grouping key so the pseudobulking step can use it\n",
"    sc_counts.obs = sc_counts.obs[['cell_type', 'sm_name', 'donor_id', 'row', 'plate_name', 'well', 'plate_well_cell_type']]\n",
"    sc_counts.var = sc_counts.var[[]]\n",
"\n",
"    del sc_counts.obsm\n",
"    del sc_counts.uns\n",
"\n",
"    sc_counts.write(par['sc_counts_filtered'])\n",
"\n",
"preprocess_sc(par)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import anndata as ad\n",
"import numpy as np\n",
"import pandas as pd\n",
"from scipy import sparse\n",
"\n",
"par = {\n",
"    'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',\n",
"    'pseudobulked': '../resources_raw/pseudobulked.h5ad',\n",
"}\n",
"\n",
"def sum_by(adata: ad.AnnData, col: str) -> ad.AnnData:\n",
"    \"\"\"\n",
"    Sum `.X` over the groups defined by the categorical column `obs[col]`.\n",
"\n",
"    Adapted from this forum post:\n",
"    https://discourse.scverse.org/t/group-sum-rows-based-on-jobs-feature/371/4\n",
"    \"\"\"\n",
"    assert pd.api.types.is_categorical_dtype(adata.obs[col])\n",
"\n",
"    # sum `.X` entries for each unique value in `col` via a sparse group-indicator matrix\n",
"    cat = adata.obs[col].values\n",
"\n",
"    indicator = sparse.coo_matrix(\n",
"        (\n",
"            np.broadcast_to(True, adata.n_obs),\n",
"            (cat.codes, np.arange(adata.n_obs))\n",
"        ),\n",
"        shape=(len(cat.categories), adata.n_obs),\n",
"    )\n",
"\n",
"    sum_adata = ad.AnnData(\n",
"        indicator @ adata.X,\n",
"        var=adata.var,\n",
"        obs=pd.DataFrame(index=cat.categories),\n",
"    )\n",
"\n",
"    # copy over `.obs` values that have a one-to-one mapping with `.obs[col]`\n",
"    obs_cols = list(set(adata.obs.columns) - set([col]))\n",
"\n",
"    one_to_one_mapped_obs_cols = []\n",
"    nunique_in_col = adata.obs[col].nunique()\n",
"    for other_col in obs_cols:\n",
"        if len(adata.obs[[col, other_col]].drop_duplicates()) == nunique_in_col:\n",
"            one_to_one_mapped_obs_cols.append(other_col)\n",
"\n",
"    joining_df = adata.obs[[col] + one_to_one_mapped_obs_cols].drop_duplicates().set_index(col)\n",
"    assert (sum_adata.obs.index == sum_adata.obs.join(joining_df).index).all()\n",
"    sum_adata.obs = sum_adata.obs.join(joining_df)\n",
"    sum_adata.obs.index.name = col\n",
"    sum_adata.obs = sum_adata.obs.reset_index()\n",
"    sum_adata.obs.index = sum_adata.obs.index.astype('str')\n",
"\n",
"    return sum_adata\n",
"\n",
"def pseudobulked_and_filter(par):\n",
"    # pseudobulk: one observation per plate/well/cell-type group\n",
"    sc_counts = ad.read_h5ad(par['sc_counts_filtered'])\n",
"    bulk_adata = sum_by(sc_counts, 'plate_well_cell_type')\n",
"    bulk_adata.obs['cell_count'] = sc_counts.obs.groupby('plate_well_cell_type').size().values\n",
"    bulk_adata.X = np.array(bulk_adata.X.todense())\n",
"\n",
"    print('ratio of missingness:', (bulk_adata.X == 0).sum() / bulk_adata.X.size)\n",
"    bulk_adata.var = bulk_adata.var.reset_index()\n",
"    bulk_adata.var.set_index('index', inplace=True)\n",
"\n",
"    bulk_adata.X = np.nan_to_num(bulk_adata.X, nan=0)\n",
"\n",
"    # write out the pseudobulked object (output path taken from `par`)\n",
"    bulk_adata.write(par['pseudobulked'])\n",
"\n",
"pseudobulked_and_filter(par)"
]
}
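,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check of the pseudobulked output: reload it and inspect its dimensions and the per-group cell counts. This is a minimal sketch; it assumes `pseudobulked_and_filter` above has run and written `../resources_raw/pseudobulked.h5ad`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sanity check (assumes the pseudobulked file written above exists)\n",
"import anndata as ad\n",
"\n",
"bulk = ad.read_h5ad('../resources_raw/pseudobulked.h5ad')\n",
"print(bulk)  # dimensions and available slots\n",
"print(bulk.obs['cell_count'].describe())  # cells aggregated per plate/well/cell-type group"
]
}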
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
}