Skip to content

Commit

Permalink
Fix tests for similarity metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
LouisK92 committed Sep 22, 2024
1 parent 10f7e50 commit 2998666
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 64 deletions.
30 changes: 0 additions & 30 deletions src/api/comp_metric.yaml

This file was deleted.

34 changes: 34 additions & 0 deletions src/api/comp_metric_quality.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
namespace: metrics
info:
type: metric
subtype: quality metric
type_info:
label: Quality Metric
summary: A metric for evaluating the quality of the processed iST data
description: |
This metric assesses the quality of the processed iST data.
arguments:
- name: --input
__merge__: file_spatial_corrected_counts.yaml
required: true
direction: input
- name: --input_qc_col
__merge__: file_spatial_qc_col.yaml
direction: input
required: true
- name: --input_transcript_assignments
__merge__: file_transcript_assignments.yaml
direction: input
required: true
- name: "--score"
__merge__: file_score.yaml
direction: output
required: true

test_resources:
- path: /resources_test/task_ist_preprocessing/mouse_brain_combined
dest: resources_test/task_ist_preprocessing/mouse_brain_combined
- type: python_script
path: /common/component_tests/run_and_check_output.py
- type: python_script
path: /common/component_tests/check_config.py
12 changes: 7 additions & 5 deletions src/api/comp_metric_similarity.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ arguments:
required: true
direction: output
__merge__: file_score.yaml
# test_resources:
# - path: /resources_test/common/pancreas
# dest: resources_test/common/pancreas
# - type: python_script
# path: /common/component_tests/run_and_check_output.py

test_resources:
- path: /resources_test/task_ist_preprocessing/mouse_brain_combined
dest: resources_test/task_ist_preprocessing/mouse_brain_combined
- type: python_script
path: /common/component_tests/run_and_check_output.py
- type: python_script
path: /common/component_tests/check_config.py


16 changes: 8 additions & 8 deletions src/api/file_score.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,6 @@ info:
format:
type: h5ad
uns:
- type: string
name: dataset_id
description: "A unique identifier for the dataset"
required: true
- type: string
name: method_id
description: "A unique identifier for the method"
required: true
- type: string
name: metric_ids
description: "One or more unique metric identifiers"
Expand All @@ -24,3 +16,11 @@ info:
description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'."
multiple: true
required: true
# - type: string
# name: dataset_id
# description: "A unique identifier for the dataset"
# required: true
# - type: string
# name: method_id
# description: "A unique identifier for the method"
# required: true
78 changes: 61 additions & 17 deletions src/metrics/similarity/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,78 @@ name: similarity_metrics
# Metadata for your component
info:
metrics:
# A unique identifier for your metric (required).
# Can contain only lowercase letters or underscores.
- name: negative_marker_purity_reads
# A relatively short label, used when rendering visualisations (required)
label: Negative Marker Purity (Reads)
# A one sentence summary of how this metric works (required). Used when
# rendering summary tables.
summary: "The percentage of negative marker reads assigned to the correct cell types."
# A multi-line description of how this component works (required). Used
# when rendering reference documentation.
description: |
The percentage of negative marker reads assigned to the correct cell types.
# A reference key from the bibtex library at src/common/library.bib (required).
references:
doi: None
# The minimum possible value for this metric (required)
doi: "10.1101/2023.02.13.528102"
min: 0
# The maximum possible value for this metric (required)
max: 1
# Whether a higher value represents a 'better' solution (required)
maximize: true
- name: negative_marker_purity_cells
label: Negative Marker Purity (Cells)
summary: "The percentage of cells that do not contain counts of negative markers of their specific cell type."
description: |
The percentage of cells that do not contain counts of negative markers of their specific cell type.
references:
doi: "10.1101/2023.02.13.528102"
min: 0
max: 1
maximize: true
- name: coexpr_similarity
label: Co-expression Similarity
summary: "The similarity between the co-expression patterns of spatial and scRNA-seq data."
description: |
The similarity is calculated as the absolute difference between the correlation matrices of spatial and
scRNA-seq data. The correlation matrices contain pair-wise correlations between all genes in the dataset.
references:
doi: "10.1101/2023.02.13.528102"
min: 0
max: 1
maximize: true
- name: coexpr_similarity_celltype
label: Co-expression Similarity (Cell Type)
summary: "The similarity between the within cell type co-expression patterns of spatial and scRNA-seq data."
description: |
The similarity is calculated as the absolute difference between the correlation matrices of spatial and
scRNA-seq data for each cell type. The final score is the mean over cell types. The correlation matrices contain
pair-wise correlations between all genes in the dataset.
references:
doi: "10.1101/2023.02.13.528102"
min: 0
max: 1
- name: rel_pairwise_ct_expr_sim
label: Relative Pairwise Cell Type Expression Similarity
summary: "Similarity of the mean expression differences between cell type pairs, compared between spatial and scRNA-seq data."
description: |
todo
references:
doi: "10.1101/2023.02.13.528102"
min: 0
max: 1
- name: rel_pairwise_gene_expr_sim
label: Relative Pairwise Gene Expression Similarity
summary: "Similarity of the mean expression differences between gene pairs, compared between spatial and scRNA-seq data."
description: |
todo
references:
doi: "10.1101/2023.02.13.528102"
min: 0
max: 1
- name: knn_mixing
label: KNN Mixing
summary: "Measure of the modality mixing within the joint knn graph of spatial and scRNA-seq data."
description: |
todo
references:
doi: "10.1101/2023.02.13.528102"
min: 0
max: 1


# Component-specific parameters (optional)
# arguments:
# - name: "--n_neighbors"
# type: "integer"
# default: 5
# description: Number of neighbors to use.

# Resources required to run the component
resources:
Expand Down
12 changes: 8 additions & 4 deletions src/metrics/similarity/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
'input': "resources_test/task_ist_preprocessing/mouse_brain_combined/corrected_counts.h5ad",
'input_sc': "resources_test/task_ist_preprocessing/mouse_brain_combined/normalised_counts.h5ad",
'input_qc_col': "resources_test/task_ist_preprocessing/mouse_brain_combined/qc_col.h5ad",
'input_sc': "resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad",
'input_qc_col': "resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_qc_col.h5ad",
'output': "metrics.h5ad",
}
meta = {
Expand All @@ -22,6 +22,8 @@
adata_sp_QC_obs_col = ad.read_h5ad(par['input_qc_col'])
adata_sp.obs['passed_QC'] = adata_sp_QC_obs_col.obs['passed_QC']
adata_sc = ad.read_h5ad(par['input_sc'])
adata_sp.X = adata_sp.layers['normalized'] # TODO: ideally we don't do this, but some txsim functions seem to still expect .X (e.g. coexpression_similarity), fix this within txsim.
adata_sc.X = adata_sc.layers['normalized'] # TODO: same for scRNAseq data

# There should be at least two cell types overlapping between scRNAseq and spatial data
cts_sc = adata_sc.obs['cell_type'].dtype.categories
Expand All @@ -36,8 +38,10 @@


print('Compute metrics', flush=True)
df_filtered = tx.metrics.all_metrics(adata_sp[adata_sp.obs['passed_QC']], adata_sc, key="cell_type")
df = tx.metrics.all_metrics(adata_sp, adata_sc, key="cell_type")
df_filtered = tx.metrics.all_metrics(
adata_sp[adata_sp.obs['passed_QC']], adata_sc, key="cell_type", raw_layer="counts", lognorm_layer="normalized"
)
df = tx.metrics.all_metrics(adata_sp, adata_sc, key="cell_type", raw_layer="counts", lognorm_layer="normalized")

uns_metric_ids = df.index.to_list() + [f"{metric}_qc_filtered" for metric in df_filtered.index]
uns_metric_values = np.concatenate([df.values, df_filtered.values])
Expand Down

0 comments on commit 2998666

Please sign in to comment.