Compute gene expression (#5)

* updated to support run_azimuth * azimuth implemented * gene expression done * gene expression done * Finished up azimuth data download cwl/scripts * Add reference data input * Remove gene expression (not part of pr) * Add draft gene expression code * modified gene expr code * dockerized gene expression * refactored main.py --------- Co-authored-by: Vicky Daiya <[email protected]>
hubmapconsortium · Oct 23, 2023 · dc754a8 · dc754a8
1 parent 314e2a3
commit dc754a8
Show file tree

Hide file tree

Showing 7 changed files with 207 additions and 3 deletions.
diff --git a/.github/workflows/build-containers.yml b/.github/workflows/build-containers.yml
@@ -2,7 +2,7 @@ name: Build Containers
 
 on:
   push:
-    branches: ['main']
+    branches: ["main"]
 
 env:
   REGISTRY: ghcr.io
@@ -15,8 +15,16 @@ jobs:
       packages: write
     strategy:
       matrix:
-        container: [azimuth, celltypist, extract-summary, popv, preprocess]
-
+        container:
+          [
+            azimuth,
+            celltypist,
+            extract-summary,
+            gene-expression,
+            popv,
+            preprocess,
+          ]
+
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2

diff --git a/containers/gene-expression/Dockerfile b/containers/gene-expression/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.10
+
+COPY context/requirements-freeze.txt .
+RUN pip install -r requirements-freeze.txt
+
+COPY context/ .
+
+CMD ["python", "/main.py"]
diff --git a/containers/gene-expression/context/main.py b/containers/gene-expression/context/main.py
@@ -0,0 +1,107 @@
+import argparse
+import anndata
+import numpy as np
+import scanpy as sc
+import pandas as pd
+from pathlib import Path
+
+MIN_CELLS_PER_CT = 2  # need a filter to reomve CTs with one cell as sc.tl.rank_genes_groups gives an error
+
+
+def filter_matrix(matrix: anndata.AnnData, annotation_column: str):
+    """filters an anndata matrix to only include cells which are annotated to a cell type with more than MIN_CELLS_PER_CT"""
+    ct_counts = matrix.obs[annotation_column].value_counts()
+    valid_cts = ct_counts.index[ct_counts >= MIN_CELLS_PER_CT]
+    mask = np.isin(matrix.obs[annotation_column], valid_cts)
+    return matrix[mask, :]
+
+
+def format_marker_genes_df(df: pd.DataFrame, annotation_column: str):
+    """formats the output from sc.tl.rank_genes_groups to a dataframe with celltype as one column and list of marker genes as other"""
+    df = df.transpose()
+    df["marker_genes"] = df.apply(lambda row: row.tolist(), axis=1)
+    df = df["marker_genes"].rename_axis(annotation_column).reset_index()
+    return df
+
+
+def get_mean_expr_value(matrix, annotation_column, cell_type, gene):
+    """gets the mean expression value for a cell type and a gene"""
+    cell_indices = [
+        matrix.obs.index.get_loc(cell_index)
+        for cell_index in matrix.obs[matrix.obs[annotation_column] == cell_type].index
+    ]
+    mean_expr = matrix.X[cell_indices, matrix.var.index.get_loc(gene)].mean()
+    return mean_expr
+
+
+def get_marker_genes_with_expr(matrix, annotation_column, cell_type, marker_genes):
+    """gets the mean expression values for all marker genes"""
+    output = []
+    for gene in marker_genes:
+        output.append(
+            {
+                "gene_label": gene,
+                "mean_gene_expr_value": get_mean_expr_value(
+                    matrix, annotation_column, cell_type, gene
+                ),
+            }
+        )
+    return output
+
+
+def get_gene_expr(
+    matrix: anndata.AnnData, annotation_column: str, gene_expr_column: str
+):
+    """gets the marker genes and mean expression values for all cells in the anndata matrix"""
+
+    matrix.raw = matrix  # for getting gene names as output for sc.tl.rank_genes_groups
+    filtered_matrix = filter_matrix(matrix, annotation_column)
+    sc.tl.rank_genes_groups(filtered_matrix, groupby=annotation_column, n_genes=10)
+    ct_marker_genes_df = format_marker_genes_df(
+        pd.DataFrame(filtered_matrix.uns["rank_genes_groups"]["names"]),
+        annotation_column,
+    )
+
+    ct_marker_genes_df[gene_expr_column] = ct_marker_genes_df.apply(
+        lambda row: get_marker_genes_with_expr(
+            matrix, annotation_column, row[annotation_column], row["marker_genes"]
+        ),
+        axis=1,
+    )
+    merged_obs = matrix.obs.merge(
+        ct_marker_genes_df[[annotation_column, gene_expr_column]], how="left"
+    )
+    merged_obs[gene_expr_column] = merged_obs[gene_expr_column].astype(str)
+    merged_obs.index = matrix.obs.index
+    matrix.obs = merged_obs
+    return matrix
+
+
+def main(args: argparse.Namespace):
+    matrix = get_gene_expr(args.matrix, args.annotation_column, args.gene_expr_column)
+    matrix.write_h5ad(args.output_matrix)
+
+
+def _get_arg_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Add gene expressions to h5ad data")
+    parser.add_argument("matrix", type=anndata.read_h5ad, help="h5ad data file")
+    parser.add_argument(
+        "--annotation-column", required=True, help="Column with annotations"
+    )
+    parser.add_argument(
+        "--gene-expr-column", default="gene_expr", help="Column for gene_expr"
+    )
+    parser.add_argument(
+        "--output-matrix",
+        type=Path,
+        default="matrix_with_gene_expr.h5ad",
+        help="matrix with gene expressions output path",
+    )
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = _get_arg_parser()
+    args = parser.parse_args()
+    main(args)
diff --git a/containers/gene-expression/context/requirements-freeze.txt b/containers/gene-expression/context/requirements-freeze.txt
@@ -0,0 +1,35 @@
+anndata==0.10.2
+array-api-compat==1.4
+contourpy==1.1.1
+cycler==0.12.1
+fonttools==4.43.1
+h5py==3.10.0
+joblib==1.3.2
+kiwisolver==1.4.5
+llvmlite==0.41.1
+matplotlib==3.8.0
+natsort==8.4.0
+networkx==3.2
+numba==0.58.1
+numpy==1.26.1
+packaging==23.2
+pandas==2.1.1
+pathlib==1.0.1
+patsy==0.5.3
+Pillow==10.1.0
+pynndescent==0.5.10
+pyparsing==3.1.1
+python-dateutil==2.8.2
+pytz==2023.3.post1
+scanpy==1.9.5
+scikit-learn==1.3.2
+scipy==1.11.3
+seaborn==0.13.0
+session-info==1.0.0
+six==1.16.0
+statsmodels==0.14.0
+stdlib-list==0.9.0
+threadpoolctl==3.2.0
+tqdm==4.66.1
+tzdata==2023.3
+umap-learn==0.5.4
diff --git a/containers/gene-expression/context/requirements.txt b/containers/gene-expression/context/requirements.txt
@@ -0,0 +1,4 @@
+anndata
+numpy
+scanpy
+pandas
diff --git a/containers/gene-expression/options.yml b/containers/gene-expression/options.yml
@@ -0,0 +1,14 @@
+type: record
+name: options
+label: Gene expression options
+fields:
+  annotationColumn:
+    type: string
+    label: Annotation column
+    inputBinding:
+      prefix: --annotation-column
+  geneExprColumn:
+    type: string
+    label: Gene expression column
+    inputBinding:
+      prefix: --gene-expr-column
diff --git a/containers/gene-expression/pipeline.cwl b/containers/gene-expression/pipeline.cwl
@@ -0,0 +1,28 @@
+#!/usr/bin/env cwl-runner
+class: CommandLineTool
+cwlVersion: v1.0
+
+requirements:
+  DockerRequirement:
+    dockerPull: ghcr.io/hubmapconsortium/hra-workflows/gene-expression:main
+  SchemaDefRequirement:
+    types:
+      - $import: ./options.yml
+
+baseCommand: python
+arguments:
+  - /main.py
+
+inputs:
+  matrix:
+    type: File
+    label: Data to get gene expression for in h5ad format
+    inputBinding:
+      position: 0
+  options: ./options.yml#options
+
+outputs:
+  matrix_with_gene_expr:
+    type: File
+    outputBinding:
+      glob: matrix_with_gene_expr.h5ad
-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
+    anndata
+    numpy
+    scanpy
+    pandas