Skip to content

Commit

Permalink
Improve crosswalking and algorithm metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
axdanbol committed Feb 9, 2024
1 parent cad838c commit b1bac58
Show file tree
Hide file tree
Showing 20 changed files with 409 additions and 281 deletions.
9 changes: 0 additions & 9 deletions containers/azimuth/context/annotation-levels.json

This file was deleted.

4 changes: 2 additions & 2 deletions containers/azimuth/context/download-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
set -e

OUTPUT_DIR=${1:-"./azimuth"}
MAPPING_FILE=${2:-"/organ-mapping.json"}
METADATA_FILE=${2:-"/organ-metadata.json"}
export R_LIBS="$OUTPUT_DIR"

mkdir -p "$OUTPUT_DIR"
Rscript /download_reference_data.R "$MAPPING_FILE" "$OUTPUT_DIR"
Rscript /download_reference_data.R "$METADATA_FILE" "$OUTPUT_DIR"
6 changes: 3 additions & 3 deletions containers/azimuth/context/download_reference_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ library(Seurat)
library(SeuratData)

args <- commandArgs(trailingOnly = TRUE)
organ_mapping_file <- args[1]
organ_metadata_file <- args[1]
output_dir <- args[2]

# Load unique reference organs
mapping <- fromJSON(file = organ_mapping_file)
references <- unlist(unique(mapping))
metadata <- fromJSON(file = organ_metadata_file)
references <- unique(sapply(metadata, function(item) item$model))

# Download and install data
options(timeout=60 * 60) # Probably overkill but the default of 60s is too low for some of the datasets
Expand Down
65 changes: 28 additions & 37 deletions containers/azimuth/context/main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import logging
import subprocess
import typing as t
Expand All @@ -7,24 +6,31 @@
import anndata
import pandas

from src.algorithm import Algorithm, OrganLookup, add_common_arguments
from src.algorithm import Algorithm, RunResult, add_common_arguments


class AzimuthOrganMetadata(t.TypedDict):
model: str
organ_level: str
prediction_column: str


class AzimuthOptions(t.TypedDict):
reference_data_dir: Path
annotation_levels: Path


class AzimuthAlgorithm(Algorithm[str, AzimuthOptions]):
def __init__(self):
super().__init__(OrganLookup)

def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
class AzimuthAlgorithm(Algorithm[AzimuthOrganMetadata, AzimuthOptions]):
def do_run(
self,
matrix: Path,
organ: str,
metadata: AzimuthOrganMetadata,
options: AzimuthOptions,
) -> RunResult:
"""Annotate data using azimuth."""
data = anndata.read_h5ad(matrix)
reference_data = self.find_reference_data(organ, options["reference_data_dir"])
annotation_level = self.find_annotation_level(
organ, options["annotation_levels"]
reference_data = self.find_reference_data(
organ, metadata["model"], options["reference_data_dir"]
)

# Azimuth chokes when trying to load matrices that have
Expand All @@ -41,7 +47,11 @@ def do_run(self, matrix: Path, organ: str, options: AzimuthOptions):
annotated_matrix = anndata.read_h5ad(annotated_matrix_path)
self.copy_annotations(data, annotated_matrix)

return data, annotation_level
return {
"data": data,
"organ_level": metadata["organ_level"],
"prediction_column": "predicted." + metadata["prediction_column"],
}

def create_clean_matrix(self, matrix: anndata.AnnData) -> anndata.AnnData:
"""Creates a copy of the data with all observation columns removed.
Expand Down Expand Up @@ -73,7 +83,7 @@ def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str:
Args:
matrix_path (Path): Path to data file
reference_data (Path): Path to organ reference data directory
reference_data (Path): Path to model reference data directory
Returns:
str: Path to the output data file
Expand All @@ -82,11 +92,12 @@ def run_azimuth_scripts(self, matrix_path: Path, reference_data: Path) -> str:
subprocess.run(script_command, capture_output=True, check=True, text=True)
return "./result.h5ad"

def find_reference_data(self, organ: str, dir: Path) -> Path:
"""Finds the reference data directory for an organ.
def find_reference_data(self, organ: str, model: str, dir: Path) -> Path:
"""Finds the reference data directory for a model.
Args:
organ (str): Organ name
organ (str): Organ id
model (str): Model name
dir (Path): Directory to search
Raises:
Expand All @@ -97,7 +108,7 @@ def find_reference_data(self, organ: str, dir: Path) -> Path:
"""

def is_reference_data_candidate(path: Path):
return path.is_dir() and organ.lower() in path.name.lower()
return path.is_dir() and model.lower() in path.name.lower()

subdir = self._find_in_dir(
dir,
Expand All @@ -108,20 +119,6 @@ def is_reference_data_candidate(path: Path):
# idx.annoy and ref.Rds is always located inside an 'azimuth' subdirectory
return subdir / "azimuth"

def find_annotation_level(self, organ: str, path: Path) -> str:
"""Finds the column name which contains the predictions.
Args:
organ (str): Organ name
path (Path): Path to file containing information about column names
Returns:
str: Column name
"""
with open(path) as file:
levels_by_organ = json.load(file)
return "predicted." + levels_by_organ[organ]

def _find_in_dir(
self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str
) -> Path:
Expand Down Expand Up @@ -159,12 +156,6 @@ def _get_arg_parser():
required=True,
help="Path to directory with reference data",
)
parser.add_argument(
"--annotation-levels",
type=Path,
default="/annotation-levels.json",
help="Json file with organ to annotation levels",
)

return parser

Expand Down
14 changes: 0 additions & 14 deletions containers/azimuth/context/organ-mapping.json

This file was deleted.

62 changes: 62 additions & 0 deletions containers/azimuth/context/organ-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"UBERON:0002113": {
"model": "kidneyref",
"organ_level": "Kidney_L3",
"prediction_column": "annotation.l3"
},
"UBERON:0004538": {
"model": "kidneyref",
"organ_level": "Kidney_L3",
"prediction_column": "annotation.l3"
},
"UBERON:0004539": {
"model": "kidneyref",
"organ_level": "Kidney_L3",
"prediction_column": "annotation.l3"
},
"UBERON:0002048": {
"model": "lungref",
"organ_level": "Lung_v2_finest_level",
"prediction_column": "ann_finest_level"
},
"UBERON:0001004": {
"model": "lungref",
"organ_level": "Lung_v2_finest_level",
"prediction_column": "ann_finest_level"
},
"UBERON:0000948": {
"model": "heartref",
"organ_level": "Heart_L2",
"prediction_column": "celltype.l2"
},
"UBERON:0000955": {
"model": "humancortexref",
"organ_level": "",
"prediction_column": "subclass"
},
"UBERON:0001264": {
"model": "pancreasref",
"organ_level": "Pancreas_L1",
"prediction_column": "annotation.l1"
},
"UBERON:0002373": {
"model": "tonsilref",
"organ_level": "",
"prediction_column": "celltype.l2"
},
"UBERON:0000178": {
"model": "pbmcref",
"organ_level": "Human_PBMC_L2",
"prediction_column": "celltype.l2"
},
"UBERON:0002371": {
"model": "bonemarrowref",
"organ_level": "Bone_marrow_L2",
"prediction_column": "celltype.l2"
},
"UBERON:0001013": {
"model": "adiposeref",
"organ_level": "",
"prediction_column": "celltype.l2"
}
}
41 changes: 20 additions & 21 deletions containers/celltypist/context/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,38 @@
import pandas
import scanpy

from src.algorithm import Algorithm, OrganLookup, add_common_arguments
from src.algorithm import Algorithm, RunResult, add_common_arguments


class CelltypistOptions(t.TypedDict):
ensemble_lookup: Path

class CelltypistOrganMetadata(t.TypedDict):
model: str

class CelltypistOrganLookup(OrganLookup[celltypist.Model]):
def __init__(self, mapping_file: Path):
super().__init__(mapping_file)

def get_builtin_options(self):
"""Get builtin celltypist models."""
models = celltypist.models.get_all_models()
return map(lambda model: (model, self.from_raw(model)), models)

def from_raw(self, id: str):
"""Load a celltypist model."""
return celltypist.models.Model.load(id)
class CelltypistOptions(t.TypedDict):
ensemble_lookup: Path


class CelltypistAlgorithm(Algorithm[celltypist.Model, CelltypistOptions]):
class CelltypistAlgorithm(Algorithm[CelltypistOrganMetadata, CelltypistOptions]):
def __init__(self):
super().__init__(CelltypistOrganLookup, "predicted_labels")

def do_run(self, matrix: Path, organ: celltypist.Model, options: CelltypistOptions):
super().__init__("predicted_labels")

def do_run(
self,
matrix: Path,
organ: str,
metadata: CelltypistOrganMetadata,
options: CelltypistOptions,
) -> RunResult:
"""Annotate data using celltypist."""
data = scanpy.read_h5ad(matrix)
data = self.normalize(data)
data, var_names = self.normalize_var_names(data, options)
data = celltypist.annotate(data, organ, majority_voting=True).to_adata()
data = celltypist.annotate(
data, metadata["model"], majority_voting=True
).to_adata()
data.var_names = t.cast(t.Any, var_names)
return data

return {"data": data, "organ_level": metadata["model"].replace(".", "_")}

def normalize(self, data: scanpy.AnnData) -> scanpy.AnnData:
"""Normalizes data according to celltypist requirements.
Expand Down
9 changes: 0 additions & 9 deletions containers/celltypist/context/organ-mapping.json

This file was deleted.

23 changes: 23 additions & 0 deletions containers/celltypist/context/organ-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"UBERON:0002048": {
"model": "Human_Lung_Atlas.pkl"
},
"UBERON:0001004": {
"model": "Human_Lung_Atlas.pkl"
},
"UBERON:0002097": {
"model": "Adult_Human_Skin.pkl"
},
"UBERON:0001264": {
"model": "Adult_Human_PancreaticIslet.pkl"
},
"UBERON:0000948": {
"model": "Healthy_Adult_Heart.pkl"
},
"UBERON:0002107": {
"model": "Healthy_Human_Liver.pkl"
},
"UBERON:0000955": {
"model": "Human_AdultAged_Hippocampus.pkl"
}
}
Loading

0 comments on commit b1bac58

Please sign in to comment.