Skip to content

Commit

Permalink
Compute gene expression (#5)
Browse files Browse the repository at this point in the history
* updated to support run_azimuth

* azimuth implemented

* gene expression done

* gene expression done

* Finished up azimuth data download cwl/scripts

* Add reference data input

* Remove gene expression (not part of pr)

* Add draft gene expression code

* modified gene expr code

* dockerized gene expression

* refactored main.py

---------

Co-authored-by: Vicky Daiya <[email protected]>
  • Loading branch information
axdanbol and vickydaiya authored Oct 23, 2023
1 parent 314e2a3 commit dc754a8
Show file tree
Hide file tree
Showing 7 changed files with 207 additions and 3 deletions.
14 changes: 11 additions & 3 deletions .github/workflows/build-containers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Build Containers

on:
push:
branches: ['main']
branches: ["main"]

env:
REGISTRY: ghcr.io
Expand All @@ -15,8 +15,16 @@ jobs:
packages: write
strategy:
matrix:
container: [azimuth, celltypist, extract-summary, popv, preprocess]

container:
[
azimuth,
celltypist,
extract-summary,
gene-expression,
popv,
preprocess,
]

steps:
- name: Checkout repository
uses: actions/checkout@v2
Expand Down
8 changes: 8 additions & 0 deletions containers/gene-expression/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM python:3.10

COPY context/requirements-freeze.txt .
RUN pip install -r requirements-freeze.txt

COPY context/ .

CMD ["python", "/main.py"]
107 changes: 107 additions & 0 deletions containers/gene-expression/context/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import argparse
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
from pathlib import Path

MIN_CELLS_PER_CT = 2 # need a filter to reomve CTs with one cell as sc.tl.rank_genes_groups gives an error


def filter_matrix(matrix: anndata.AnnData, annotation_column: str):
"""filters an anndata matrix to only include cells which are annotated to a cell type with more than MIN_CELLS_PER_CT"""
ct_counts = matrix.obs[annotation_column].value_counts()
valid_cts = ct_counts.index[ct_counts >= MIN_CELLS_PER_CT]
mask = np.isin(matrix.obs[annotation_column], valid_cts)
return matrix[mask, :]


def format_marker_genes_df(df: pd.DataFrame, annotation_column: str):
"""formats the output from sc.tl.rank_genes_groups to a dataframe with celltype as one column and list of marker genes as other"""
df = df.transpose()
df["marker_genes"] = df.apply(lambda row: row.tolist(), axis=1)
df = df["marker_genes"].rename_axis(annotation_column).reset_index()
return df


def get_mean_expr_value(matrix, annotation_column, cell_type, gene):
"""gets the mean expression value for a cell type and a gene"""
cell_indices = [
matrix.obs.index.get_loc(cell_index)
for cell_index in matrix.obs[matrix.obs[annotation_column] == cell_type].index
]
mean_expr = matrix.X[cell_indices, matrix.var.index.get_loc(gene)].mean()
return mean_expr


def get_marker_genes_with_expr(matrix, annotation_column, cell_type, marker_genes):
"""gets the mean expression values for all marker genes"""
output = []
for gene in marker_genes:
output.append(
{
"gene_label": gene,
"mean_gene_expr_value": get_mean_expr_value(
matrix, annotation_column, cell_type, gene
),
}
)
return output


def get_gene_expr(
matrix: anndata.AnnData, annotation_column: str, gene_expr_column: str
):
"""gets the marker genes and mean expression values for all cells in the anndata matrix"""

matrix.raw = matrix # for getting gene names as output for sc.tl.rank_genes_groups
filtered_matrix = filter_matrix(matrix, annotation_column)
sc.tl.rank_genes_groups(filtered_matrix, groupby=annotation_column, n_genes=10)
ct_marker_genes_df = format_marker_genes_df(
pd.DataFrame(filtered_matrix.uns["rank_genes_groups"]["names"]),
annotation_column,
)

ct_marker_genes_df[gene_expr_column] = ct_marker_genes_df.apply(
lambda row: get_marker_genes_with_expr(
matrix, annotation_column, row[annotation_column], row["marker_genes"]
),
axis=1,
)
merged_obs = matrix.obs.merge(
ct_marker_genes_df[[annotation_column, gene_expr_column]], how="left"
)
merged_obs[gene_expr_column] = merged_obs[gene_expr_column].astype(str)
merged_obs.index = matrix.obs.index
matrix.obs = merged_obs
return matrix


def main(args: argparse.Namespace):
matrix = get_gene_expr(args.matrix, args.annotation_column, args.gene_expr_column)
matrix.write_h5ad(args.output_matrix)


def _get_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="Add gene expressions to h5ad data")
parser.add_argument("matrix", type=anndata.read_h5ad, help="h5ad data file")
parser.add_argument(
"--annotation-column", required=True, help="Column with annotations"
)
parser.add_argument(
"--gene-expr-column", default="gene_expr", help="Column for gene_expr"
)
parser.add_argument(
"--output-matrix",
type=Path,
default="matrix_with_gene_expr.h5ad",
help="matrix with gene expressions output path",
)

return parser


if __name__ == "__main__":
parser = _get_arg_parser()
args = parser.parse_args()
main(args)
35 changes: 35 additions & 0 deletions containers/gene-expression/context/requirements-freeze.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
anndata==0.10.2
array-api-compat==1.4
contourpy==1.1.1
cycler==0.12.1
fonttools==4.43.1
h5py==3.10.0
joblib==1.3.2
kiwisolver==1.4.5
llvmlite==0.41.1
matplotlib==3.8.0
natsort==8.4.0
networkx==3.2
numba==0.58.1
numpy==1.26.1
packaging==23.2
pandas==2.1.1
pathlib==1.0.1
patsy==0.5.3
Pillow==10.1.0
pynndescent==0.5.10
pyparsing==3.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
scanpy==1.9.5
scikit-learn==1.3.2
scipy==1.11.3
seaborn==0.13.0
session-info==1.0.0
six==1.16.0
statsmodels==0.14.0
stdlib-list==0.9.0
threadpoolctl==3.2.0
tqdm==4.66.1
tzdata==2023.3
umap-learn==0.5.4
4 changes: 4 additions & 0 deletions containers/gene-expression/context/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
anndata
numpy
scanpy
pandas
14 changes: 14 additions & 0 deletions containers/gene-expression/options.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
type: record
name: options
label: Gene expression options
fields:
annotationColumn:
type: string
label: Annotation column
inputBinding:
prefix: --annotation-column
geneExprColumn:
type: string
label: Gene expression column
inputBinding:
prefix: --gene-expr-column
28 changes: 28 additions & 0 deletions containers/gene-expression/pipeline.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env cwl-runner
class: CommandLineTool
cwlVersion: v1.0

requirements:
DockerRequirement:
dockerPull: ghcr.io/hubmapconsortium/hra-workflows/gene-expression:main
SchemaDefRequirement:
types:
- $import: ./options.yml

baseCommand: python
arguments:
- /main.py

inputs:
matrix:
type: File
label: Data to get gene expression for in h5ad format
inputBinding:
position: 0
options: ./options.yml#options

outputs:
matrix_with_gene_expr:
type: File
outputBinding:
glob: matrix_with_gene_expr.h5ad

0 comments on commit dc754a8

Please sign in to comment.