Skip to content

Commit

Permalink
Improve crosswalking and algorithm metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
axdanbol committed Feb 9, 2024
1 parent cad838c commit b1bac58
Show file tree
Hide file tree
Showing 20 changed files with 409 additions and 281 deletions.
9 changes: 0 additions & 9 deletions containers/azimuth/context/annotation-levels.json

This file was deleted.

4 changes: 2 additions & 2 deletions containers/azimuth/context/download-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
set -e

OUTPUT_DIR=${1:-"./azimuth"}
MAPPING_FILE=${2:-"/organ-mapping.json"}
METADATA_FILE=${2:-"/organ-metadata.json"}
export R_LIBS="$OUTPUT_DIR"

mkdir -p "$OUTPUT_DIR"
Rscript /download_reference_data.R "$MAPPING_FILE" "$OUTPUT_DIR"
Rscript /download_reference_data.R "$METADATA_FILE" "$OUTPUT_DIR"
6 changes: 3 additions & 3 deletions containers/azimuth/context/download_reference_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ library(Seurat)
library(SeuratData)

args <- commandArgs(trailingOnly = TRUE)
organ_mapping_file <- args[1]
organ_metadata_file <- args[1]
output_dir <- args[2]

# Load unique reference organs
mapping <- fromJSON(file = organ_mapping_file)
references <- unlist(unique(mapping))
metadata <- fromJSON(file = organ_metadata_file)
references <- unique(sapply(metadata, function(item) item$model))

# Download and install data
options(timeout=60 * 60) # Probably overkill but the default of 60s is too low for some of the datasets
Expand Down
65 changes: 28 additions & 37 deletions containers/azimuth/context/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import logging
import subprocess
import typing as t
Expand All @@ -7,24 +6,31 @@
import anndata
import pandas

from src.algorithm import Algorithm, OrganLookup, add_common_arguments
from src.algorithm import Algorithm, RunResult, add_common_arguments


class AzimuthOrganMetadata(t.TypedDict):
model: str
organ_level: str
prediction_column: str


class AzimuthOptions(t.TypedDict):
reference_data_dir: Path
annotation_levels: Path


class AzimuthAlgorithm(Algorithm[str, AzimuthOptions]):
def __init__(self):
super().__init__(OrganLookup)

def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
class AzimuthAlgorithm(Algorithm[AzimuthOrganMetadata, AzimuthOptions]):
def do_run(
self,
matrix: Path,
organ: str,
metadata: AzimuthOrganMetadata,
options: AzimuthOptions,
) -> RunResult:
"""Annotate data using azimuth."""
data = anndata.read_h5ad(matrix)
reference_data = self.find_reference_data(organ, options["reference_data_dir"])
annotation_level = self.find_annotation_level(
organ, options["annotation_levels"]
reference_data = self.find_reference_data(
organ, metadata["model"], options["reference_data_dir"]
)

# Azimuth chokes when trying to load matrices that have
Expand All @@ -41,7 +47,11 @@ def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
annotated_matrix = anndata.read_h5ad(annotated_matrix_path)
self.copy_annotations(data, annotated_matrix)

return data, annotation_level
return {
"data": data,
"organ_level": metadata["organ_level"],
"prediction_column": "predicted." + metadata["prediction_column"],
}

def create_clean_matrix(self, matrix: anndata.AnnData) -> anndata.AnnData:
"""Creates a copy of the data with all observation columns removed.
Expand Down Expand Up @@ -73,7 +83,7 @@ def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str:
Args:
matrix_path (Path): Path to data file
reference_data (Path): Path to organ reference data directory
reference_data (Path): Path to model reference data directory
Returns:
str: Path to the output data file
Expand All @@ -82,11 +92,12 @@ def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str:
subprocess.run(script_command, capture_output=True, check=True, text=True)
return "./result.h5ad"

def find_reference_data(self, organ: str, dir: Path) -> Path:
"""Finds the reference data directory for an organ.
def find_reference_data(self, organ: str, model: str, dir: Path) -> Path:
"""Finds the reference data directory for a model.
Args:
organ (str): Organ name
organ (str): Organ id
model (str): Model name
dir (Path): Directory to search
Raises:
Expand All @@ -97,7 +108,7 @@ def find_reference_data(self, organ: str, dir: Path) -> Path:
"""

def is_reference_data_candidate(path: Path):
return path.is_dir() and organ.lower() in path.name.lower()
return path.is_dir() and model.lower() in path.name.lower()

subdir = self._find_in_dir(
dir,
Expand All @@ -108,20 +119,6 @@ def is_reference_data_candidate(path: Path):
# idx.annoy and ref.Rds is always located inside an 'azimuth' subdirectory
return subdir / "azimuth"

def find_annotation_level(self, organ: str, path: Path) -> str:
"""Finds the column name which contains the predictions.
Args:
organ (str): Organ name
path (Path): Path to file containing information about column names
Returns:
str: Column name
"""
with open(path) as file:
levels_by_organ = json.load(file)
return "predicted." + levels_by_organ[organ]

def _find_in_dir(
self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str
) -> Path:
Expand Down Expand Up @@ -159,12 +156,6 @@ def _get_arg_parser():
required=True,
help="Path to directory with reference data",
)
parser.add_argument(
"--annotation-levels",
type=Path,
default="/annotation-levels.json",
help="Json file with organ to annotation levels",
)

return parser

Expand Down
14 changes: 0 additions & 14 deletions containers/azimuth/context/organ-mapping.json

This file was deleted.

62 changes: 62 additions & 0 deletions containers/azimuth/context/organ-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"UBERON:0002113": {
"model": "kidneyref",
"organ_level": "Kidney_L3",
"prediction_column": "annotation.l3"
},
"UBERON:0004538": {
"model": "kidneyref",
"organ_level": "Kidney_L3",
"prediction_column": "annotation.l3"
},
"UBERON:0004539": {
"model": "kidneyref",
"organ_level": "Kidney_L3",
"prediction_column": "annotation.l3"
},
"UBERON:0002048": {
"model": "lungref",
"organ_level": "Lung_v2_finest_level",
"prediction_column": "ann_finest_level"
},
"UBERON:0001004": {
"model": "lungref",
"organ_level": "Lung_v2_finest_level",
"prediction_column": "ann_finest_level"
},
"UBERON:0000948": {
"model": "heartref",
"organ_level": "Heart_L2",
"prediction_column": "celltype.l2"
},
"UBERON:0000955": {
"model": "humancortexref",
"organ_level": "",
"prediction_column": "subclass"
},
"UBERON:0001264": {
"model": "pancreasref",
"organ_level": "Pancreas_L1",
"prediction_column": "annotation.l1"
},
"UBERON:0002373": {
"model": "tonsilref",
"organ_level": "",
"prediction_column": "celltype.l2"
},
"UBERON:0000178": {
"model": "pbmcref",
"organ_level": "Human_PBMC_L2",
"prediction_column": "celltype.l2"
},
"UBERON:0002371": {
"model": "bonemarrowref",
"organ_level": "Bone_marrow_L2",
"prediction_column": "celltype.l2"
},
"UBERON:0001013": {
"model": "adiposeref",
"organ_level": "",
"prediction_column": "celltype.l2"
}
}
41 changes: 20 additions & 21 deletions containers/celltypist/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,38 @@
import pandas
import scanpy

from src.algorithm import Algorithm, OrganLookup, add_common_arguments
from src.algorithm import Algorithm, RunResult, add_common_arguments


class CelltypistOptions(t.TypedDict):
ensemble_lookup: Path

class CelltypistOrganMetadata(t.TypedDict):
model: str

class CelltypistOrganLookup(OrganLookup[celltypist.Model]):
def __init__(self, mapping_file: Path):
super().__init__(mapping_file)

def get_builtin_options(self):
"""Get builtin celltypist models."""
models = celltypist.models.get_all_models()
return map(lambda model: (model, self.from_raw(model)), models)

def from_raw(self, id: str):
"""Load a celltypist model."""
return celltypist.models.Model.load(id)
class CelltypistOptions(t.TypedDict):
ensemble_lookup: Path


class CelltypistAlgorithm(Algorithm[celltypist.Model, CelltypistOptions]):
class CelltypistAlgorithm(Algorithm[CelltypistOrganMetadata, CelltypistOptions]):
def __init__(self):
super().__init__(CelltypistOrganLookup, "predicted_labels")

def do_run(self, matrix: Path, organ: celltypist.Model, options: CelltypistOptions):
super().__init__("predicted_labels")

def do_run(
self,
matrix: Path,
organ: str,
metadata: CelltypistOrganMetadata,
options: CelltypistOptions,
) -> RunResult:
"""Annotate data using celltypist."""
data = scanpy.read_h5ad(matrix)
data = self.normalize(data)
data, var_names = self.normalize_var_names(data, options)
data = celltypist.annotate(data, organ, majority_voting=True).to_adata()
data = celltypist.annotate(
data, metadata["model"], majority_voting=True
).to_adata()
data.var_names = t.cast(t.Any, var_names)
return data

return {"data": data, "organ_level": metadata["model"].replace(".", "_")}

def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData:
"""Normalizes data according to celltypist requirements.
Expand Down
9 changes: 0 additions & 9 deletions containers/celltypist/context/organ-mapping.json

This file was deleted.

23 changes: 23 additions & 0 deletions containers/celltypist/context/organ-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"UBERON:0002048": {
"model": "Human_Lung_Atlas.pkl"
},
"UBERON:0001004": {
"model": "Human_Lung_Atlas.pkl"
},
"UBERON:0002097": {
"model": "Adult_Human_Skin.pkl"
},
"UBERON:0001264": {
"model": "Adult_Human_PancreaticIslet.pkl"
},
"UBERON:0000948": {
"model": "Healthy_Adult_Heart.pkl"
},
"UBERON:0002107": {
"model": "Healthy_Human_Liver.pkl"
},
"UBERON:0000955": {
"model": "Human_AdultAged_Hippocampus.pkl"
}
}
Loading

0 comments on commit b1bac58

Please sign in to comment.