Merge pull request #7 from openproblems-bio/add_pciseq

Add pciseq
openproblems-bio · Dec 23, 2024 · 778f2b8 · 778f2b8
2 parents 7a0d863 + 1dfd101
commit 778f2b8
Show file tree

Hide file tree

Showing 6 changed files with 331 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -18,4 +18,6 @@ singularity_container/
 /resources
 /.vscode
 /.nextflow*
-/work
+/work
+
+.DS_STORE
diff --git a/src/api/comp_method_transcript_assignment.yaml b/src/api/comp_method_transcript_assignment.yaml
@@ -19,6 +19,11 @@ arguments:
     required: false
     direction: input
     __merge__: file_scrnaseq_reference.yaml
+  - name: "--sc_cell_type_key"
+    type: string
+    required: false
+    direction: input
+    default: cell_type 
   - name: "--output"
     __merge__: file_transcript_assignments.yaml
     direction: output

diff --git a/src/api/file_transcript_assignments.yaml b/src/api/file_transcript_assignments.yaml
@@ -50,4 +50,4 @@ info:
           - type: string
             name: region
             description: Region
-            required: true
+            required: false
diff --git a/src/methods_transcript_assignment/basic_transcript_assignment/script.py b/src/methods_transcript_assignment/basic_transcript_assignment/script.py
@@ -2,6 +2,7 @@
 import dask
 import spatialdata as sd
 import anndata as ad
+import pandas as pd
 import os
 import shutil
 
@@ -11,7 +12,7 @@
   'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
   'transcripts_key': 'transcripts',
   'coordinate_system': 'global',
-  'output': 'assigned_transcripts.zarr',
+  'output': 'basic_assigned_transcripts.zarr',
 }
 meta = {
   'name': 'basic'
@@ -49,6 +50,17 @@
 )
 sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series 
 
+#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
+unique_cells = np.unique(cell_id_dask_series)
+
+# check if a '0' (noise/background) cell is in cell_id and remove
+zero_idx = np.where(unique_cells == 0)
+if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0])
+
+#transform into pandas series and check
+cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
+assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix"
+
 # TODO: Also take care of the following cases:
 # - segmentation 3D, transcripts 3D
 # - segmentation 3D, transcripts 2D
@@ -61,7 +73,7 @@
   },
   tables={
     "table": ad.AnnData(
-      obs=sdata.tables["table"].obs[["cell_id", "region"]],
+      obs=pd.DataFrame(cell_id_col),
       var=sdata.tables["table"].var[[]]
     )
   }

diff --git a/src/methods_transcript_assignment/pciSeq_transcript_assignment/config.vsh.yaml b/src/methods_transcript_assignment/pciSeq_transcript_assignment/config.vsh.yaml
@@ -0,0 +1,165 @@
+__merge__: /src/api/comp_method_transcript_assignment.yaml
+
+name: pciseq_transcript_assignment
+label: "pciSeq Transcript Assignment"
+summary: "Assign transcripts to cells using the pciSeq method from Qian et al. (2020)"
+description: "Uses a reference scRNA-seq dataset to probabilistically assign cell types and transcripts to cells."
+links:
+  documentation: "https://github.com/acycliq/pciSeq"
+  repository: "https://github.com/acycliq/pciSeq"
+references:
+  doi: "10.1038/s41592-019-0631-4"
+
+arguments:
+  - name: --transcripts_key
+    type: string
+    description: The key of the transcripts within the points of the spatial data
+    default: transcripts
+  - name: --coordinate_system
+    type: string
+    description: The key of the pixel space coordinate system within the spatial data
+    default: global
+  # - name: --sc_cell_type_key
+  #   type: string
+  #   default: cell_type
+  #   required: true
+  #   direction: input
+  #   description: The name of column in the SC-RNAseq AnnData .obs with the cell type of each cell
+
+  # - name: --exclude_genes
+  #   type: string
+  #   required: false
+  #   description: "list of genes to be excluded during cell-typing, e.g ['Aldoc', 'Id2'] to exclude all spots from Aldoc and Id2"
+  #   direction: input 
+  #   default: None
+
+  - name: --max_iter
+    type: integer
+    required: false
+    description: "Maximum number of loops allowed for the Variational Bayes to run"
+    direction: input 
+    default: 1000
+
+  - name: --CellCallTolerance
+    type: double
+    required: false
+    description: "Convergence is achieved if the change in assignment probabilities between two successive loops is less than the tolerance"
+    direction: input 
+    default: 0.02
+
+  - name: --rGene
+    type: double
+    required: false
+    description: |
+      A gamma distribution expresses the efficiency of the in-situ sequencing for each gene. It tries to capture
+      the ratio of the observed over the theoretical counts for a given gene. rGene controls the variance and
+      Inefficiency is the average of this assumed Gamma distribution.
+    direction: input 
+    default: 20
+
+  - name: --Inefficiency
+    type: double
+    required: false
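+    description: "Average of the Gamma distribution that models the per-gene in-situ sequencing efficiency (rGene controls its variance)"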
+    direction: input 
+    default: 0.2
+
+  - name: --InsideCellBonus
+    type: double
+    required: false
+    description: |
+      If a spot is inside the cell boundaries this bonus will give the likelihood an extra boost
+      in order to make the spot more probable to get assigned to the cell than another spot positioned
+      outside the cell boundaries.
+    direction: input 
+    default: 2
+
+  - name: --MisreadDensity
+    type: double
+    required: false
+    description: |
+      To account for spots far from the soma a uniform distribution is introduced to describe those misreads.
+      By default this uniform distribution has a density of 1e-5 misreads per pixel.
+    direction: input 
+    default: 0.00001
+
+  - name: --SpotReg
+    type: double
+    required: false
+    description: |
+      Gene detection might come with irregularities due to technical errors. A small value is introduced
+      here to account for these errors. It is an additive factor, applied to the single cell expression
+      counts when the mean counts per class and per gene are calculated.
+    direction: input 
+    default: 0.1
+
+  - name: --nNeighbors
+    type: integer
+    required: false
+    description: |
+      By default only the 3 nearest cells will be considered as possible parent cells for any given spot.
+      There is also one extra 'super-neighbor', which is always a neighbor to the spots so we can assign
+      the misreads to it. Could be seen as the background. Hence, by default the algorithm examines
+      whether any of the 3 nearest cells is a possible parent cell to a given spot or whether the spot is
+      a misread.
+    direction: input 
+    default: 3
+
+  # 
+  - name: --rSpot
+    type: double
+    required: false
+    description: |
+      A gamma distributed variate from Gamma(rSpot, 1) is applied to the mean expression, hence the counts
+      are distributed according to a Negative Binomial distribution.
+      The value for rSpot will control the variance/dispersion of the counts.
+    direction: input 
+    default: 2
+
+  - name: --save_data
+    type: boolean
+    required: false
+    description: "Boolean, if True the output will be saved as tsv files in a folder named 'pciSeq' in your system's temp dir."
+    direction: input 
+    default: False
+
+  # output directory 'default' will save to temp location
+  # - name: output_path
+  #   default: ['default']
+
+  # 
+  # - name: --dtype
+  #   type: string
+  #   required: false
+  #   description: |
+  #     "Use either np.float16 or np.float32 to reduce memory usage. In most cases RAM consumption shouldnt
+  #     need more than 32Gb RAM. If you have a dataset from a full coronal mouse slice with a high number of
+  #     segmented cells (around 150,000) a gene panel of more than 250 genes and 100 or more different
+  #     cell types (aka clusters, aka classes) in the single cell data then you might need at least 64GB on
+  #     your machine. Changing the datatype to a float16 or float32 will help keeping RAM usage to a lower
+  #     level"
+  #   direction: input 
+  #   default: np.float64
+
+
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    __merge__: 
+      - /src/base/setup_spatialdata_partial.yaml
+      - /src/base/setup_txsim_partial.yaml
+    setup:
+      - type: python
+        pypi: [pciseq]
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ midtime, midcpu, midmem ]
diff --git a/src/methods_transcript_assignment/pciSeq_transcript_assignment/script.py b/src/methods_transcript_assignment/pciSeq_transcript_assignment/script.py
@@ -0,0 +1,143 @@
+import numpy as np
+import dask
+import spatialdata as sd
+import txsim as tx
+import anndata as ad
+import os
+import shutil
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {
+  'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
+  'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
+  'transcripts_key': 'transcripts',
+  'coordinate_system': 'global',
+  'output': '../pciSeq_assigned_transcripts.zarr',
+
+  'input_scrnaseq': 'resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad',
+  'sc_cell_type_key': 'cell_type',
+
+  'exclude_genes': None,
+  'max_iter': 1000,
+  'CellCallTolerance': 0.02,
+  'rGene': 20,
+  'Inefficiency': 0.2,
+  'InsideCellBonus': 2,
+  'MisreadDensity': 0.00001,
+  'SpotReg': 0.1,
+  'nNeighbors': 3,
+  'rSpot': 2,
+  'save_data': False,
+  'dtype': np.float64
+}
+meta = {
+  'name': 'pciSeq_transcript_assignment'
+}
+## VIASH END
+
+# Read the raw iST data and the segmentation (both SpatialData zarr stores)
+print('Reading input files', flush=True)
+sdata = sd.read_zarr(par['input_ist'])
+sdata_segm = sd.read_zarr(par['input_segmentation'])
+
+# Check that the requested coordinate system exists for both the transcripts and the segmentation
+transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
+assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
+segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
+assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
+
+# Transform transcript coordinates to the requested coordinate system
+print('Transforming transcripts coordinates', flush=True)
+transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])
+
+# In case of a translation transformation of the segmentation (e.g. crop of the data), adjust the transcript coordinates by applying the inverse of the segmentation's transformation
+trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
+transcripts = sd.transform(transcripts, trans, par['coordinate_system'])
+
+# Assign cell ids to transcripts: first materialize the transformed x/y coordinates as NumPy arrays
+print('Assigning transcripts to cell ids', flush=True)
+y_coords = transcripts.y.compute().to_numpy()
+x_coords = transcripts.x.compute().to_numpy()
+
+# pciSeq input: one row per transcript holding the gene name plus the transformed x/y coordinates
+# TODO: this breaks as soon as the gene name column is not called 'feature_name'
+transcripts_dataframe = sdata[par['transcripts_key']].compute()[['feature_name']] 
+transcripts_dataframe['x'] = x_coords
+transcripts_dataframe['y'] = y_coords
+
+# Segmentation label image (scale0) as a NumPy array
+label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy() #TODO: maybe this line needs generalization (DataTree vs DataArray)
+
+# Collect the pciSeq options from par ('exclude_genes' is currently disabled)
+opts_keys = [#'exclude_genes',
+  'max_iter',
+  'CellCallTolerance',
+  'rGene',
+  'Inefficiency',
+  'InsideCellBonus',
+  'MisreadDensity',
+  'SpotReg',
+  'nNeighbors',
+  'rSpot',
+  'save_data']
+
+opts = {k: par[k] for k in opts_keys}
+
+input_scrnaseq = ad.read_h5ad(par['input_scrnaseq'])
+input_scrnaseq.X = input_scrnaseq.layers['counts']  # hand the 'counts' layer to pciSeq as X
+
+assignments, cell_types = tx.preprocessing.run_pciSeq(
+    transcripts_dataframe,
+    label_image,
+    input_scrnaseq,
+    par['sc_cell_type_key'],
+    opts
+)
+
+# Attach the pciSeq assignment ('cell' column) to the transcripts as 'cell_id', chunked like the existing partitions
+cell_id_dask_series = dask.dataframe.from_dask_array(
+    dask.array.from_array(
+        assignments['cell'].to_numpy(), chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
+    ), 
+    index=sdata[par['transcripts_key']].index
+)
+
+sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series 
+
+# Create the per-cell .obs from the pciSeq cell-type output (its index is used as 'cell_id', corresponding with the transcripts 'cell_id')
+cell_types['type'] = cell_types['type'].replace({'None':'None_sp'})  # rename the literal 'None' label to 'None_sp'
+cell_types.insert(0, 'cell_id', cell_types.index)
+cell_types.rename(columns={'type':'cell_type','prob':'cell_type_prob'}, inplace=True)
+
+assert 0 not in cell_types['cell_id'], "Found '0' in cell_id column of assingment output cell matrix"  # NOTE(review): `in` on a Series tests the index, not the values; equivalent here only because 'cell_id' was copied from the index above
+
+output_table = ad.AnnData(
+      obs=cell_types[['cell_id','cell_type','cell_type_prob']],
+      var=sdata.tables["table"].var[[]]
+    )
+
+# TODO: Also take care of the following cases:
+# - segmentation 3D, transcripts 3D
+# - segmentation 3D, transcripts 2D
+# - segmentation 2D, transcripts 3D
+
+# Build the output SpatialData: the transcripts (now carrying 'cell_id') plus the per-cell table
+
+print('Subsetting to transcripts cell id and cell type data', flush=True)
+sdata_transcripts_only = sd.SpatialData(
+  points={
+    "transcripts": sdata[par['transcripts_key']]
+  },
+  tables={
+    "table": output_table
+  }
+)
+
+print('Write transcripts with cell ids and cell types', flush=True)
+if os.path.exists(par["output"]):  # remove any existing output before writing
+  shutil.rmtree(par["output"])
+sdata_transcripts_only.write(par['output'])
+
+