Commit
matin authored and committed on Jul 24, 2024
1 parent 75ba304, commit aeb4073
Showing 3 changed files with 184 additions and 1,899 deletions.
@@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download the raw single-cell and multiome counts from the public OpenProblems S3 bucket\n",
"!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/sc_counts.h5ad ./resources_raw/ --no-sign-request\n",
"!aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/multiome_counts.h5mu ./resources_raw/ --no-sign-request"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preprocess the single-cell counts (QC, cell and gene filtering, cell-type merging), then pseudobulk per plate, well and cell type."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install sctk anndata\n",
"\n",
"import anndata as ad\n",
"import pandas as pd\n",
"import numpy as np\n",
"import sctk\n",
"\n",
"par = {\n",
"    'sc_counts': '../resources_raw/sc_counts.h5ad',\n",
"    'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',\n",
"}\n",
"\n",
"def preprocess_sc(par):\n",
"    # clean up: keep only the obs columns needed downstream and reset X to raw counts\n",
"    sc_counts = ad.read_h5ad(par['sc_counts'])\n",
"    sc_counts.obs = sc_counts.obs[['well', 'row', 'col', 'plate_name', 'cell_type', 'sm_name', 'donor_id']]\n",
"    sc_counts.X = sc_counts.layers['counts']\n",
"    del sc_counts.layers\n",
"    del sc_counts.obsm\n",
"    sc_counts.var_names_make_unique()\n",
"    # pseudobulk grouping key: one group per plate, well and cell type\n",
"    sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_name'].astype('str') \\\n",
"        + '_' + sc_counts.obs['well'].astype('str') \\\n",
"        + '_' + sc_counts.obs['cell_type'].astype('str')\n",
"    sc_counts.obs['plate_well_cell_type'] = sc_counts.obs['plate_well_cell_type'].astype('category')\n",
"\n",
"    # merge T-cell subtypes into a single 'T cells' label\n",
"    CELL_TYPES = ['NK cells', 'T cells CD4+', 'T cells CD8+', 'T regulatory cells', 'B cells', 'Myeloid cells']\n",
"    T_cell_types = ['T regulatory cells', 'T cells CD8+', 'T cells CD4+']\n",
"    cell_type_map = {cell_type: 'T cells' if cell_type in T_cell_types else cell_type for cell_type in CELL_TYPES}\n",
"    sc_counts.obs['cell_type'] = sc_counts.obs['cell_type'].map(cell_type_map)\n",
"\n",
"    # QC metrics\n",
"    sctk.calculate_qc(sc_counts)\n",
"    sctk.cellwise_qc(sc_counts)\n",
"\n",
"    # filtering\n",
"    # cell-wise: flag cells with a high hemoglobin fraction\n",
"    filter_percent_hb = sc_counts.obs.percent_hb > 0.2\n",
"    print('cells flagged by hemoglobin filter:', filter_percent_hb.sum())\n",
"\n",
"    # gene-wise\n",
"    # Step 1: List the plates\n",
"    plates = sc_counts.obs['plate_name'].unique()\n",
"\n",
"    # Step 2: Initialize a DataFrame to store counts\n",
"    gene_counts_per_plate = pd.DataFrame(index=sc_counts.var_names, columns=plates, dtype=int)\n",
"\n",
"    # Step 3: Iterate over each plate and calculate expression counts\n",
"    for plate in plates:\n",
"        # Subset the AnnData object for the current plate\n",
"        subset = sc_counts[sc_counts.obs['plate_name'] == plate]\n",
"\n",
"        # Count, per gene, the number of cells with non-zero expression\n",
"        expressed_genes = (subset.X > 0).sum(axis=0)\n",
"\n",
"        # Convert from sparse matrix format if necessary\n",
"        if isinstance(expressed_genes, np.matrix):\n",
"            expressed_genes = np.array(expressed_genes).flatten()\n",
"\n",
"        # Store the counts in the DataFrame\n",
"        gene_counts_per_plate[plate] = expressed_genes\n",
"\n",
"    # Step 4: Aggregate counts across plates\n",
"    # `max` keeps a gene if it meets the threshold in at least one plate\n",
"    max_counts = gene_counts_per_plate.max(axis=1)\n",
"\n",
"    # Step 5: Keep genes expressed in at least 100 cells in any plate\n",
"    genes_to_keep = max_counts >= 100\n",
"    print('retained genes:', genes_to_keep.sum())\n",
"\n",
"    # actual filtering; copy to materialise the subset instead of keeping a view\n",
"    sc_counts = sc_counts[~filter_percent_hb, genes_to_keep].copy()\n",
"\n",
"    # clean: keep the grouping key so the pseudobulking step can use it\n",
"    sc_counts.obs = sc_counts.obs[['cell_type', 'sm_name', 'donor_id', 'row', 'plate_name', 'well', 'plate_well_cell_type']]\n",
"    sc_counts.var = sc_counts.var[[]]\n",
"\n",
"    del sc_counts.obsm\n",
"    del sc_counts.uns\n",
"\n",
"    sc_counts.write(par['sc_counts_filtered'])\n",
"\n",
"preprocess_sc(par)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import anndata as ad\n",
"import numpy as np\n",
"import pandas as pd\n",
"from scipy import sparse\n",
"\n",
"par = {\n",
"    'sc_counts_filtered': '../resources_raw/sc_counts_filtered.h5ad',\n",
"    'pseudobulked': '../resources_raw/pseudobulked.h5ad',\n",
"}\n",
"\n",
"def sum_by(adata: ad.AnnData, col: str) -> ad.AnnData:\n",
"    \"\"\"\n",
"    Sum `.X` over the groups defined by the categorical column `obs[col]`.\n",
"\n",
"    Adapted from this forum post:\n",
"    https://discourse.scverse.org/t/group-sum-rows-based-on-jobs-feature/371/4\n",
"    \"\"\"\n",
"    assert pd.api.types.is_categorical_dtype(adata.obs[col])\n",
"\n",
"    # sum `.X` entries for each unique value in `col` via a sparse group-indicator matrix\n",
"    cat = adata.obs[col].values\n",
"\n",
"    indicator = sparse.coo_matrix(\n",
"        (\n",
"            np.broadcast_to(True, adata.n_obs),\n",
"            (cat.codes, np.arange(adata.n_obs))\n",
"        ),\n",
"        shape=(len(cat.categories), adata.n_obs),\n",
"    )\n",
"\n",
"    sum_adata = ad.AnnData(\n",
"        indicator @ adata.X,\n",
"        var=adata.var,\n",
"        obs=pd.DataFrame(index=cat.categories),\n",
"    )\n",
"\n",
"    # copy over `.obs` values that have a one-to-one mapping with `.obs[col]`\n",
"    obs_cols = list(set(adata.obs.columns) - set([col]))\n",
"\n",
"    one_to_one_mapped_obs_cols = []\n",
"    nunique_in_col = adata.obs[col].nunique()\n",
"    for other_col in obs_cols:\n",
"        if len(adata.obs[[col, other_col]].drop_duplicates()) == nunique_in_col:\n",
"            one_to_one_mapped_obs_cols.append(other_col)\n",
"\n",
"    joining_df = adata.obs[[col] + one_to_one_mapped_obs_cols].drop_duplicates().set_index(col)\n",
"    assert (sum_adata.obs.index == sum_adata.obs.join(joining_df).index).all()\n",
"    sum_adata.obs = sum_adata.obs.join(joining_df)\n",
"    sum_adata.obs.index.name = col\n",
"    sum_adata.obs = sum_adata.obs.reset_index()\n",
"    sum_adata.obs.index = sum_adata.obs.index.astype('str')\n",
"\n",
"    return sum_adata\n",
"\n",
"def pseudobulked_and_filter(par):\n",
"    # pseudobulk: one observation per plate/well/cell-type group\n",
"    sc_counts = ad.read_h5ad(par['sc_counts_filtered'])\n",
"    bulk_adata = sum_by(sc_counts, 'plate_well_cell_type')\n",
"    bulk_adata.obs['cell_count'] = sc_counts.obs.groupby('plate_well_cell_type').size().values\n",
"    bulk_adata.X = np.array(bulk_adata.X.todense())\n",
"\n",
"    print('ratio of missingness:', (bulk_adata.X == 0).sum() / bulk_adata.X.size)\n",
"    bulk_adata.var = bulk_adata.var.reset_index()\n",
"    bulk_adata.var.set_index('index', inplace=True)\n",
"\n",
"    bulk_adata.X = np.nan_to_num(bulk_adata.X, nan=0)\n",
"\n",
"    # write out the pseudobulked object (output path taken from `par`)\n",
"    bulk_adata.write(par['pseudobulked'])\n",
"\n",
"pseudobulked_and_filter(par)"
]
}
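,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check of the pseudobulked output: reload it and inspect its dimensions and the per-group cell counts. This is a minimal sketch; it assumes `pseudobulked_and_filter` above has run and written `../resources_raw/pseudobulked.h5ad`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sanity check (assumes the pseudobulked file written above exists)\n",
"import anndata as ad\n",
"\n",
"bulk = ad.read_h5ad('../resources_raw/pseudobulked.h5ad')\n",
"print(bulk)  # dimensions and available slots\n",
"print(bulk.obs['cell_count'].describe())  # cells aggregated per plate/well/cell-type group"
]
}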
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
}