-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* updated to support run_azimuth * azimuth implemented * gene expression done * gene expression done * Finished up azimuth data download cwl/scripts * Add reference data input * Remove gene expression (not part of pr) * Add draft gene expression code * modified gene expr code * dockerized gene expression * refactored main.py --------- Co-authored-by: Vicky Daiya <[email protected]>
- Loading branch information
1 parent
314e2a3
commit dc754a8
Showing
7 changed files
with
207 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
FROM python:3.10 | ||
|
||
COPY context/requirements-freeze.txt . | ||
RUN pip install -r requirements-freeze.txt | ||
|
||
COPY context/ . | ||
|
||
CMD ["python", "/main.py"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import argparse | ||
import anndata | ||
import numpy as np | ||
import scanpy as sc | ||
import pandas as pd | ||
from pathlib import Path | ||
|
||
MIN_CELLS_PER_CT = 2 # need a filter to reomve CTs with one cell as sc.tl.rank_genes_groups gives an error | ||
|
||
|
||
def filter_matrix(matrix: anndata.AnnData, annotation_column: str): | ||
"""filters an anndata matrix to only include cells which are annotated to a cell type with more than MIN_CELLS_PER_CT""" | ||
ct_counts = matrix.obs[annotation_column].value_counts() | ||
valid_cts = ct_counts.index[ct_counts >= MIN_CELLS_PER_CT] | ||
mask = np.isin(matrix.obs[annotation_column], valid_cts) | ||
return matrix[mask, :] | ||
|
||
|
||
def format_marker_genes_df(df: pd.DataFrame, annotation_column: str): | ||
"""formats the output from sc.tl.rank_genes_groups to a dataframe with celltype as one column and list of marker genes as other""" | ||
df = df.transpose() | ||
df["marker_genes"] = df.apply(lambda row: row.tolist(), axis=1) | ||
df = df["marker_genes"].rename_axis(annotation_column).reset_index() | ||
return df | ||
|
||
|
||
def get_mean_expr_value(matrix, annotation_column, cell_type, gene): | ||
"""gets the mean expression value for a cell type and a gene""" | ||
cell_indices = [ | ||
matrix.obs.index.get_loc(cell_index) | ||
for cell_index in matrix.obs[matrix.obs[annotation_column] == cell_type].index | ||
] | ||
mean_expr = matrix.X[cell_indices, matrix.var.index.get_loc(gene)].mean() | ||
return mean_expr | ||
|
||
|
||
def get_marker_genes_with_expr(matrix, annotation_column, cell_type, marker_genes): | ||
"""gets the mean expression values for all marker genes""" | ||
output = [] | ||
for gene in marker_genes: | ||
output.append( | ||
{ | ||
"gene_label": gene, | ||
"mean_gene_expr_value": get_mean_expr_value( | ||
matrix, annotation_column, cell_type, gene | ||
), | ||
} | ||
) | ||
return output | ||
|
||
|
||
def get_gene_expr( | ||
matrix: anndata.AnnData, annotation_column: str, gene_expr_column: str | ||
): | ||
"""gets the marker genes and mean expression values for all cells in the anndata matrix""" | ||
|
||
matrix.raw = matrix # for getting gene names as output for sc.tl.rank_genes_groups | ||
filtered_matrix = filter_matrix(matrix, annotation_column) | ||
sc.tl.rank_genes_groups(filtered_matrix, groupby=annotation_column, n_genes=10) | ||
ct_marker_genes_df = format_marker_genes_df( | ||
pd.DataFrame(filtered_matrix.uns["rank_genes_groups"]["names"]), | ||
annotation_column, | ||
) | ||
|
||
ct_marker_genes_df[gene_expr_column] = ct_marker_genes_df.apply( | ||
lambda row: get_marker_genes_with_expr( | ||
matrix, annotation_column, row[annotation_column], row["marker_genes"] | ||
), | ||
axis=1, | ||
) | ||
merged_obs = matrix.obs.merge( | ||
ct_marker_genes_df[[annotation_column, gene_expr_column]], how="left" | ||
) | ||
merged_obs[gene_expr_column] = merged_obs[gene_expr_column].astype(str) | ||
merged_obs.index = matrix.obs.index | ||
matrix.obs = merged_obs | ||
return matrix | ||
|
||
|
||
def main(args: argparse.Namespace): | ||
matrix = get_gene_expr(args.matrix, args.annotation_column, args.gene_expr_column) | ||
matrix.write_h5ad(args.output_matrix) | ||
|
||
|
||
def _get_arg_parser() -> argparse.ArgumentParser: | ||
parser = argparse.ArgumentParser(description="Add gene expressions to h5ad data") | ||
parser.add_argument("matrix", type=anndata.read_h5ad, help="h5ad data file") | ||
parser.add_argument( | ||
"--annotation-column", required=True, help="Column with annotations" | ||
) | ||
parser.add_argument( | ||
"--gene-expr-column", default="gene_expr", help="Column for gene_expr" | ||
) | ||
parser.add_argument( | ||
"--output-matrix", | ||
type=Path, | ||
default="matrix_with_gene_expr.h5ad", | ||
help="matrix with gene expressions output path", | ||
) | ||
|
||
return parser | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = _get_arg_parser() | ||
args = parser.parse_args() | ||
main(args) |
35 changes: 35 additions & 0 deletions
35
containers/gene-expression/context/requirements-freeze.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
anndata==0.10.2 | ||
array-api-compat==1.4 | ||
contourpy==1.1.1 | ||
cycler==0.12.1 | ||
fonttools==4.43.1 | ||
h5py==3.10.0 | ||
joblib==1.3.2 | ||
kiwisolver==1.4.5 | ||
llvmlite==0.41.1 | ||
matplotlib==3.8.0 | ||
natsort==8.4.0 | ||
networkx==3.2 | ||
numba==0.58.1 | ||
numpy==1.26.1 | ||
packaging==23.2 | ||
pandas==2.1.1 | ||
pathlib==1.0.1 | ||
patsy==0.5.3 | ||
Pillow==10.1.0 | ||
pynndescent==0.5.10 | ||
pyparsing==3.1.1 | ||
python-dateutil==2.8.2 | ||
pytz==2023.3.post1 | ||
scanpy==1.9.5 | ||
scikit-learn==1.3.2 | ||
scipy==1.11.3 | ||
seaborn==0.13.0 | ||
session-info==1.0.0 | ||
six==1.16.0 | ||
statsmodels==0.14.0 | ||
stdlib-list==0.9.0 | ||
threadpoolctl==3.2.0 | ||
tqdm==4.66.1 | ||
tzdata==2023.3 | ||
umap-learn==0.5.4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
anndata | ||
numpy | ||
scanpy | ||
pandas |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
type: record | ||
name: options | ||
label: Gene expression options | ||
fields: | ||
annotationColumn: | ||
type: string | ||
label: Annotation column | ||
inputBinding: | ||
prefix: --annotation-column | ||
geneExprColumn: | ||
type: string | ||
label: Gene expression column | ||
inputBinding: | ||
prefix: --gene-expr-column |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/usr/bin/env cwl-runner | ||
class: CommandLineTool | ||
cwlVersion: v1.0 | ||
|
||
requirements: | ||
DockerRequirement: | ||
dockerPull: ghcr.io/hubmapconsortium/hra-workflows/gene-expression:main | ||
SchemaDefRequirement: | ||
types: | ||
- $import: ./options.yml | ||
|
||
baseCommand: python | ||
arguments: | ||
- /main.py | ||
|
||
inputs: | ||
matrix: | ||
type: File | ||
label: Data to get gene expression for in h5ad format | ||
inputBinding: | ||
position: 0 | ||
options: ./options.yml#options | ||
|
||
outputs: | ||
matrix_with_gene_expr: | ||
type: File | ||
outputBinding: | ||
glob: matrix_with_gene_expr.h5ad |