Skip to content

Commit

Permalink
Merge pull request #7 from openproblems-bio/add_pciseq
Browse files Browse the repository at this point in the history
Add pciseq
  • Loading branch information
LouisK92 authored Dec 23, 2024
2 parents 7a0d863 + 1dfd101 commit 778f2b8
Show file tree
Hide file tree
Showing 6 changed files with 331 additions and 4 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ singularity_container/
/resources
/.vscode
/.nextflow*
/work
/work

.DS_STORE
5 changes: 5 additions & 0 deletions src/api/comp_method_transcript_assignment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ arguments:
required: false
direction: input
__merge__: file_scrnaseq_reference.yaml
- name: "--sc_cell_type_key"
type: string
required: false
direction: input
default: cell_type
- name: "--output"
__merge__: file_transcript_assignments.yaml
direction: output
Expand Down
2 changes: 1 addition & 1 deletion src/api/file_transcript_assignments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ info:
- type: string
name: region
description: Region
required: true
required: false
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import dask
import spatialdata as sd
import anndata as ad
import pandas as pd
import os
import shutil

Expand All @@ -11,7 +12,7 @@
'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
'transcripts_key': 'transcripts',
'coordinate_system': 'global',
'output': 'assigned_transcripts.zarr',
'output': 'basic_assigned_transcripts.zarr',
}
meta = {
'name': 'basic'
Expand Down Expand Up @@ -49,6 +50,17 @@
)
sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series

#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
unique_cells = np.unique(cell_id_dask_series)

# check if a '0' (noise/background) cell is in cell_id and remove
zero_idx = np.where(unique_cells == 0)
if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0])

#transform into pandas series and check
cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix"

# TODO: Also take care of the following cases:
# - segmentation 3D, transcripts 3D
# - segmentation 3D, transcripts 2D
Expand All @@ -61,7 +73,7 @@
},
tables={
"table": ad.AnnData(
obs=sdata.tables["table"].obs[["cell_id", "region"]],
obs=pd.DataFrame(cell_id_col),
var=sdata.tables["table"].var[[]]
)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
__merge__: /src/api/comp_method_transcript_assignment.yaml

name: pciseq_transcript_assignment
label: "pciSeq Transcript Assignment"
summary: "Assign transcripts to cells using the pciSeq method from Qian et al. (2020)"
description: "Uses a reference sc-RNAseq dataset to probabilistically assign cell types and transcripts to cells."
links:
documentation: "https://github.com/acycliq/pciSeq"
repository: "https://github.com/acycliq/pciSeq"
references:
doi: "10.1038/s41592-019-0631-4"

arguments:
- name: --transcripts_key
type: string
description: The key of the transcripts within the points of the spatial data
default: transcripts
- name: --coordinate_system
type: string
description: The key of the pixel space coordinate system within the spatial data
default: global
# - name: --sc_cell_type_key
# type: string
# default: cell_type
# required: true
# direction: input
# description: The name of column in the SC-RNAseq AnnData .obs with the cell type of each cell

# - name: --exclude_genes
# type: string
# required: false
# description: "list of genes to be excluded during cell-typing, e.g ['Aldoc', 'Id2'] to exclude all spots from Aldoc and Id2"
# direction: input
# default: None

- name: --max_iter
type: integer
required: false
description: "Maximum number of loops allowed for the Variational Bayes to run"
direction: input
default: 1000

- name: --CellCallTolerance
type: double
required: false
description: "Convergence is achieved if the change in assignment probabilities between two successive loops is less than the tolerance"
direction: input
default: 0.02

- name: --rGene
type: double
required: false
description: |
"A gamma distribution expresses the efficiency of the in-situ sequencing for each gene. It tries to capture
the ratio of the observed over the theoretical counts for a given gene. rGene controls the variance and
Inefficiency is the average of this assumed Gamma distribution"
direction: input
default: 20

- name: --Inefficiency
type: double
required: false
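description: "Mean of the Gamma distribution assumed for the per-gene detection efficiency of the in-situ sequencing (rGene controls its variance)"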
direction: input
default: 0.2

- name: --InsideCellBonus
type: double
required: false
description: |
"If a spot is inside the cell boundaries this bonus will give the likelihood an extra boost
in order to make the spot more probable to get assigned to the cell than another spot positioned
outside the cell boundaries"
direction: input
default: 2

- name: --MisreadDensity
type: double
required: false
description: |
"To account for spots far from the soma, a uniform distribution is introduced to describe those misreads.
By default this uniform distribution has a density of 1e-5 misreads per pixel."
direction: input
default: 0.00001

- name: --SpotReg
type: double
required: false
description: |
"Gene detection might come with irregularities due to technical errors. A small value is introduced
here to account for these errors. It is an additive factor, applied to the single cell expression
counts when the mean counts per class and per gene are calculated."
direction: input
default: 0.1

- name: --nNeighbors
type: integer
required: false
description: |
"By default only the 3 nearest cells will be considered as possible parent cells for any given spot.
There is also one extra 'super-neighbor', which is always a neighbor to the spots so we can assign
the misreads to. Could be seen as the background. Hence, by default the algorithm examines
whether any of the 3 nearest cells is a possible parent cell of a given spot or whether the spot is
a misread"
direction: input
default: 3

#
- name: --rSpot
type: double
required: false
description: |
"A gamma distributed variate from Gamma(rSpot, 1) is applied to the mean expression, hence the counts
are distributed according to a Negative Binomial distribution.
The value for rSpot will control the variance/dispersion of the counts"
direction: input
default: 2

- name: --save_data
type: boolean
required: false
description: "Boolean, if True the output will be saved as tsv files in a folder named 'pciSeq' in your system's temp dir."
direction: input
default: False

# output directory 'default' will save to temp location
# - name: output_path
# default: ['default']

#
# - name: --dtype
# type: string
# required: false
# description: |
# "Use either np.float16 or np.float32 to reduce memory usage. In most cases RAM consumption shouldnt
# need more than 32Gb RAM. If you have a dataset from a full coronal mouse slice with a high number of
# segmented cells (around 150,000) a gene panel of more than 250 genes and 100 or more different
# cell types (aka clusters, aka classes) in the single cell data then you might need at least 64GB on
# your machine. Changing the datatype to a float16 or float32 will help keeping RAM usage to a lower
# level"
# direction: input
# default: np.float64



resources:
- type: python_script
path: script.py

engines:
- type: docker
image: openproblems/base_python:1.0.0
__merge__:
- /src/base/setup_spatialdata_partial.yaml
- /src/base/setup_txsim_partial.yaml
setup:
- type: python
pypi: [pciseq]
- type: native

runners:
- type: executable
- type: nextflow
directives:
label: [ midtime, midcpu, midmem ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import numpy as np
import dask
import spatialdata as sd
import txsim as tx
import anndata as ad
import os
import shutil

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
# The values below are placeholders for running the script directly during development.
par = {
    # Input SpatialData stores and the keys used to look up the transcripts
    # points element and the target coordinate system.
    'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
    'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
    'transcripts_key': 'transcripts',
    'coordinate_system': 'global',
    # Destination zarr store; the script deletes it first if it already exists.
    'output': '../pciSeq_assigned_transcripts.zarr',

    # Single-cell reference (h5ad) and the .obs column holding the cell type labels.
    'input_scrnaseq': 'resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad',
    'sc_cell_type_key': 'cell_type',

    # pciSeq options. NOTE(review): 'exclude_genes' and 'dtype' are defined here but are
    # not part of the option list the script forwards to pciSeq -- confirm this is intended.
    'exclude_genes': None,
    'max_iter': 1000,
    'CellCallTolerance': 0.02,
    'rGene': 20,
    'Inefficiency': 0.2,
    'InsideCellBonus': 2,
    'MisreadDensity': 0.00001,
    'SpotReg': 0.1,
    'nNeighbors': 3,
    'rSpot': 2,
    'save_data': False,
    'dtype': np.float64
}
# Component metadata placeholder.
meta = {
    'name': 'pciSeq_transcript_assignment'
}
## VIASH END

# Main body of the pciSeq transcript-assignment component.
#
# Steps:
#   1. read the raw iST data and the segmentation,
#   2. bring the transcript coordinates into the pixel space of the label image,
#   3. run pciSeq (through txsim) with the single-cell reference,
#   4. store the resulting cell id per transcript and the per-cell type calls,
#   5. write a SpatialData object holding only the transcripts and the new table.

# Read input
print('Reading input files', flush=True)
sdata = sd.read_zarr(par['input_ist'])
sdata_segm = sd.read_zarr(par['input_segmentation'])

# Check if coordinate system is available in input data
transcripts_coord_systems = sd.transformations.get_transformation(
    sdata[par["transcripts_key"]], get_all=True
).keys()
assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
# Look the segmentation transformations up once; they are needed both for the
# availability check and for the inverse transformation below.
segmentation_transformations = sd.transformations.get_transformation(
    sdata_segm["segmentation"], get_all=True
)
segmentation_coord_systems = segmentation_transformations.keys()
assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."

# Transform transcript coordinates to the coordinate system
print('Transforming transcripts coordinates', flush=True)
transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])

# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
trans = segmentation_transformations[par['coordinate_system']].inverse()
transcripts = sd.transform(transcripts, trans, par['coordinate_system'])

# Assign cell ids to transcripts
print('Assigning transcripts to cell ids', flush=True)
y_coords = transcripts.y.compute().to_numpy()
x_coords = transcripts.x.compute().to_numpy()

# Build the spot table handed to pciSeq: gene name plus transformed x/y.
# Only the gene column is selected *before* computing, so the full transcripts
# table is never materialised and the result is a fresh frame that can be
# extended with new columns.
# TODO: this will immediately break when the name of the gene column isn't 'feature_name'
transcripts_dataframe = sdata[par['transcripts_key']][['feature_name']].compute()
transcripts_dataframe['x'] = x_coords
transcripts_dataframe['y'] = y_coords

# Label image of the segmentation
# TODO: maybe this line needs generalization (DataTree vs DataArray)
label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy()

# Grab all the pciSeq parameters ('exclude_genes' is currently not forwarded)
opts_keys = [
    'max_iter',
    'CellCallTolerance',
    'rGene',
    'Inefficiency',
    'InsideCellBonus',
    'MisreadDensity',
    'SpotReg',
    'nNeighbors',
    'rSpot',
    'save_data',
]
opts = {k: par[k] for k in opts_keys}

# Use the 'counts' layer of the single-cell reference as the expression matrix
input_scrnaseq = ad.read_h5ad(par['input_scrnaseq'])
input_scrnaseq.X = input_scrnaseq.layers['counts']

assignments, cell_types = tx.preprocessing.run_pciSeq(
    transcripts_dataframe,
    label_image,
    input_scrnaseq,
    par['sc_cell_type_key'],
    opts
)

# Assign transcript -> cell. The numpy array is chunked with the partition
# lengths of the transcripts dask dataframe so that the new column lines up
# with the existing partitions.
partition_lengths = tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
cell_id_dask_series = dask.dataframe.from_dask_array(
    dask.array.from_array(assignments['cell'].to_numpy(), chunks=partition_lengths),
    index=sdata[par['transcripts_key']].index
)

sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series

# Create new .obs for cells based on the pciSeq output (corresponding with the transcripts 'cell_id')
cell_types['type'] = cell_types['type'].replace({'None': 'None_sp'})
cell_types.insert(0, 'cell_id', cell_types.index)
cell_types.rename(columns={'type': 'cell_type', 'prob': 'cell_type_prob'}, inplace=True)

# `0 in series` would test the index labels, not the column values, so compare
# the values explicitly.
assert not (cell_types['cell_id'] == 0).any(), "Found '0' in cell_id column of assingment output cell matrix"

output_table = ad.AnnData(
    obs=cell_types[['cell_id', 'cell_type', 'cell_type_prob']],
    var=sdata.tables["table"].var[[]]
)

# TODO: Also take care of the following cases:
# - segmentation 3D, transcripts 3D
# - segmentation 3D, transcripts 2D
# - segmentation 2D, transcripts 3D

# Subset sdata to transcripts with cell ids
print('Subsetting to transcripts cell id and cell type data', flush=True)
sdata_transcripts_only = sd.SpatialData(
    points={
        "transcripts": sdata[par['transcripts_key']]
    },
    tables={
        "table": output_table
    }
)

print('Write transcripts with cell ids and cell types', flush=True)
# Remove a previous output store, if any, before writing the new one.
if os.path.exists(par["output"]):
    shutil.rmtree(par["output"])
sdata_transcripts_only.write(par['output'])


0 comments on commit 778f2b8

Please sign in to comment.