Skip to content

Commit

Permalink
added preprocess
Browse files Browse the repository at this point in the history
  • Loading branch information
matin authored and matin committed Jul 24, 2024
1 parent 75ba304 commit aeb4073
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 1,899 deletions.
182 changes: 182 additions & 0 deletions notebooks/preprocess.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/sc_counts.h5ad ./resources_raw/ --no-sign-request\n",
"!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/multiome_counts.h5mu ./resources_raw/ --no-sign-request"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
# !pip install sctk anndata
# (fixed: pip separates requirements with spaces, not commas -- `sctk,` is
# an invalid requirement specifier)

import anndata as ad
import numpy as np
import pandas as pd
import sctk

# File locations for the single-cell preprocessing step.
par = {
    'sc_counts': '../resources_raw/sc_counts.h5ad',
    'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',
}
def preprocess_sc(par):
    """Clean, QC and filter the raw single-cell counts.

    Reads ``par['sc_counts']`` (h5ad), merges the T-cell subtypes into a
    single 'T cells' label, drops high-hemoglobin cells and
    lowly-expressed genes, and writes the filtered AnnData to
    ``par['sc_counts_filtered']``.
    """
    # clean up
    sc_counts = ad.read_h5ad(par['sc_counts'])
    # BUG FIX: keep 'sm_name' in this subset -- it is selected again at the
    # end of this function, which raised a KeyError after it had been
    # dropped here.
    sc_counts.obs = sc_counts.obs[['well', 'row', 'col', 'plate_name', 'cell_type', 'donor_id', 'sm_name']]
    sc_counts.X = sc_counts.layers['counts']
    del sc_counts.layers
    del sc_counts.obsm
    sc_counts.var_names_make_unique()
    # pseudobulking key: one group per (plate, well, cell type)
    sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_name'].astype('str') \
        + '_' + sc_counts.obs['well'].astype('str') \
        + '_' + sc_counts.obs['cell_type'].astype('str')
    sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_well_cell_type'].astype('category')

    # merge cell types: collapse the three T-cell subtypes into 'T cells'
    CELL_TYPES = ['NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells', 'B cells', 'Myeloid cells']
    T_cell_types = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']
    cell_type_map = {cell_type: 'T cells' if cell_type in T_cell_types else cell_type for cell_type in CELL_TYPES}
    sc_counts.obs['cell_type'] = sc_counts.obs['cell_type'].map(cell_type_map)

    # qc
    sctk.calculate_qc(sc_counts)
    sctk.cellwise_qc(sc_counts)

    # cell-wise filter: drop cells with a high hemoglobin fraction.
    # NOTE(review): cellwise_qc results are computed above but not used in
    # the filtering below -- confirm that is intentional.
    filter_percent_hb = sc_counts.obs.percent_hb > .2

    # gene-wise filter: keep genes expressed (count > 0) in at least 100
    # cells on at least one plate
    plates = sc_counts.obs['plate_name'].unique()
    gene_counts_per_plate = pd.DataFrame(index=sc_counts.var_names, columns=plates, dtype=int)
    for plate in plates:
        # subset the AnnData object for the current plate
        subset = sc_counts[sc_counts.obs['plate_name'] == plate]
        # number of cells expressing each gene on this plate
        expressed_genes = (subset.X > 0).sum(axis=0)
        # sparse .sum() returns np.matrix; flatten to a plain 1-D array
        if isinstance(expressed_genes, np.matrix):
            expressed_genes = np.array(expressed_genes).flatten()
        gene_counts_per_plate[plate] = expressed_genes
    # a gene is kept if ANY single plate has >= 100 expressing cells
    max_counts = gene_counts_per_plate.max(axis=1)
    genes_to_keep = max_counts >= 100
    print('retained genes:', genes_to_keep.sum())

    # actual filtering; .copy() materializes the view so the .obs/.var
    # assignments below do not mutate (or warn about) the parent object
    sc_counts = sc_counts[~filter_percent_hb, genes_to_keep].copy()

    # clean: keep only the obs columns needed downstream.
    # BUG FIX: retain 'plate_well_cell_type' -- the pseudobulking step
    # groups by it, so dropping it here broke the next notebook cell.
    sc_counts.obs = sc_counts.obs[['cell_type', 'sm_name', 'donor_id', 'row', 'plate_name', 'well', 'plate_well_cell_type']]
    sc_counts.var = sc_counts.var[[]]

    del sc_counts.obsm
    del sc_counts.uns

    sc_counts.write(par['sc_counts_filtered'])
preprocess_sc(par)
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
import anndata as ad
# BUG FIX: this cell's functions use `np` and `pd` (np.broadcast_to,
# pd.DataFrame, ...) but only anndata and scipy were imported, so the
# cell was not runnable on its own.
import numpy as np
import pandas as pd
from scipy import sparse

# File locations for the pseudobulking step.
par = {
    'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',
    'pseudobulked': '../resources_raw/pseudobulked.h5ad',
}
def sum_by(adata: ad.AnnData, col: str) -> ad.AnnData:
    """Pseudobulk: sum ``adata.X`` rows within each category of ``obs[col]``.

    Adapted from this forum post:
    https://discourse.scverse.org/t/group-sum-rows-based-on-jobs-feature/371/4

    Returns a new AnnData with one observation per category of ``col``
    (in category order); obs columns that map one-to-one onto ``col``
    are carried over.
    """
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    assert isinstance(adata.obs[col].dtype, pd.CategoricalDtype)

    # sum `.X` entries for each unique value in `col`
    cat = adata.obs[col].values

    # indicator[i, j] is True iff cell j belongs to category i, so
    # `indicator @ X` sums counts within each category.
    indicator = sparse.coo_matrix(
        (
            np.broadcast_to(True, adata.n_obs),
            (cat.codes, np.arange(adata.n_obs))
        ),
        shape=(len(cat.categories), adata.n_obs),
    )

    sum_adata = ad.AnnData(
        indicator @ adata.X,
        var=adata.var,
        obs=pd.DataFrame(index=cat.categories),
    )

    # copy over `.obs` values that have a one-to-one mapping with `.obs[col]`
    # (removed a dead assignment that was immediately overwritten here)
    obs_cols = list(set(adata.obs.columns) - set([col]))

    one_to_one_mapped_obs_cols = []
    nunique_in_col = adata.obs[col].nunique()
    for other_col in obs_cols:
        # one-to-one iff each category pairs with exactly one value
        if len(adata.obs[[col, other_col]].drop_duplicates()) == nunique_in_col:
            one_to_one_mapped_obs_cols.append(other_col)

    joining_df = adata.obs[[col] + one_to_one_mapped_obs_cols].drop_duplicates().set_index(col)
    assert (sum_adata.obs.index == sum_adata.obs.join(joining_df).index).all()
    sum_adata.obs = sum_adata.obs.join(joining_df)
    sum_adata.obs.index.name = col
    sum_adata.obs = sum_adata.obs.reset_index()
    sum_adata.obs.index = sum_adata.obs.index.astype('str')

    return sum_adata
def pseudobulked_and_filter(par):
    """Pseudobulk the filtered single-cell counts per plate/well/cell-type.

    Reads ``par['sc_counts_filtered']``, sums counts within each
    'plate_well_cell_type' group, densifies the matrix, and writes the
    result to ``par['pseudobulked']``.
    """
    # pseudobulk
    sc_counts = ad.read_h5ad(par['sc_counts_filtered'])
    bulk_adata = sum_by(sc_counts, 'plate_well_cell_type')
    # groupby on a categorical follows category order, matching the row
    # order produced by sum_by above -- presumably every category is
    # non-empty; TODO confirm
    bulk_adata.obs['cell_count'] = sc_counts.obs.groupby('plate_well_cell_type').size().values
    bulk_adata.X = np.array(bulk_adata.X.todense())

    print('ratio of missingness', (bulk_adata.X == 0).sum() / bulk_adata.X.size)
    bulk_adata.var = bulk_adata.var.reset_index()
    bulk_adata.var.set_index('index', inplace=True)

    bulk_adata.X = np.nan_to_num(bulk_adata.X, nan=0)

    # BUG FIX: the result was computed but never persisted; write it to
    # the output path declared in `par` so downstream steps can read it.
    bulk_adata.write(par['pseudobulked'])
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit aeb4073

Please sign in to comment.