Skip to content

Commit

Permalink
Feat: Feature gene annotation (#132)
Browse files Browse the repository at this point in the history
* annotate (in .uns['cnv']['var'])  which window contains which genes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove black magic comma

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* first pass at gene convolution annotation

* sorting some conflicts

* feature annotation working

* adding test to calcualte gene average

* Converting gene matrix into sparse matrix

* updating gene position to using pandas

* linting

* adding more tests

* Delete .vscode directory

* adding scanpy to conf.py

* changing scanpy.tl.pca to sc.tl.pca to get build docs to work

* Saving CNV matrix as sparse matrix

* adding back infercnv.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fixing linting error

* editing code to appease ruff

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Converting gene conv to numpy array

* Making the calculation of per gene values optional

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: redst4r <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Gregor Sturm <[email protected]>
Co-authored-by: Gregor Sturm <[email protected]>
  • Loading branch information
5 people authored Jun 26, 2024
1 parent ad98e58 commit 0073690
Show file tree
Hide file tree
Showing 5 changed files with 391 additions and 33 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,9 @@ __pycache__/
/.idea/
/.vscode/

infercnv_env/
# Notebooks
.ipynb_checkpoints

# Node.js
node_modules/
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ authors = [
maintainers = [
{name = "Gregor Sturm", email="[email protected]"}
]
contributors = [
{name = "Grant Neilson", email = "[email protected]"}
]
urls.Documentation = "https://infercnvpy.readthedocs.io/"
urls.Source = "https://github.com/icbi-lab/infercnvpy"
urls.Home-page = "https://github.com/icbi-lab/infercnvpy"
Expand All @@ -30,6 +33,7 @@ dependencies = [
'pycairo>=1.20; sys_platform == "win32"',
'leidenalg',
'pyreadr',
'pytest-benchmark',
# for debug logging (referenced from the issue template)
"session-info",
"ipython"
Expand Down
207 changes: 174 additions & 33 deletions src/infercnvpy/tl/_infercnv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import scipy.ndimage
import scipy.sparse
from anndata import AnnData
Expand All @@ -30,7 +31,8 @@ def infercnv(
inplace: bool = True,
layer: str | None = None,
key_added: str = "cnv",
) -> None | tuple[dict, scipy.sparse.csr_matrix]:
calculate_gene_values: bool = False,
) -> None | tuple[dict, scipy.sparse.csr_matrix, np.ndarray | None]:
"""Infer Copy Number Variation (CNV) by averaging gene expression over genomic regions.
This method is heavily inspired by `infercnv <https://github.com/broadinstitute/inferCNV/>`_
Expand Down Expand Up @@ -79,6 +81,13 @@ def infercnv(
Key under which the cnv matrix will be stored in adata if `inplace=True`.
Will store the matrix in `adata.obsm["X_{key_added}"] and additional information
in `adata.uns[key_added]`.
calculate_gene_values
If True per gene CNVs will be calculated and stored in `adata.layers["gene_values_{key_added}"]`.
As many genes will be included in each segment the resultant per gene value will be an average of the genes included in the segment.
Additionally not all genes will be included in the per gene CNV, due to the window size and step size not always being a multiple of
the number of genes. Any genes not included in the per gene CNV will be filled with NaN.
Note this will significantly increase the memory and computation time, it is recommended to decrease the chunksize to ~100 if this is set to True.
Returns
-------
Expand Down Expand Up @@ -108,7 +117,7 @@ def infercnv(

var = tmp_adata.var.loc[:, ["chromosome", "start", "end"]] # type: ignore

chr_pos, chunks = zip(
chr_pos, chunks, convolved_dfs = zip(
*process_map(
_infercnv_chunk,
[expr[i : i + chunksize, :] for i in range(0, adata.shape[0], chunksize)],
Expand All @@ -118,19 +127,38 @@ def infercnv(
itertools.repeat(window_size),
itertools.repeat(step),
itertools.repeat(dynamic_threshold),
itertools.repeat(calculate_gene_values),
tqdm_class=tqdm,
max_workers=cpu_count() if n_jobs is None else n_jobs,
),
strict=False,
)

res = scipy.sparse.vstack(chunks)

chr_pos = chr_pos[0]

if calculate_gene_values:
per_gene_df = pd.concat(convolved_dfs, axis=0)
# Ensure the DataFrame has the correct row index
per_gene_df.index = adata.obs.index
# Ensure the per gene CNV matches the adata var (genes) index, any genes
# that are not included in the CNV will be filled with NaN
per_gene_df = per_gene_df.reindex(columns=adata.var_names, fill_value=np.nan)
# This needs to be a numpy array as colnames are too large to save in anndata
per_gene_mtx = per_gene_df.values
else:
per_gene_mtx = None

if inplace:
adata.obsm[f"X_{key_added}"] = res
adata.uns[key_added] = {"chr_pos": chr_pos[0]}
adata.uns[key_added] = {"chr_pos": chr_pos}

if calculate_gene_values:
adata.layers[f"gene_values_{key_added}"] = per_gene_mtx

else:
return chr_pos[0], res
return chr_pos, res, per_gene_mtx


def _natural_sort(l: Sequence):
Expand All @@ -148,8 +176,15 @@ def alphanum_key(key):
return sorted(l, key=alphanum_key)


def _running_mean(x: np.ndarray | scipy.sparse.spmatrix, n: int = 50, step: int = 10) -> np.ndarray:
"""Compute a pyramidially weighted running mean.
def _running_mean(
x: np.ndarray | scipy.sparse.spmatrix,
n: int = 50,
step: int = 10,
gene_list: list = None,
calculate_gene_values: bool = False,
) -> tuple[np.ndarray, pd.DataFrame | None]:
"""
Compute a pyramidially weighted running mean.
Densifies the matrix. Use `step` and `chunksize` to save memory.
Expand All @@ -160,27 +195,112 @@ def _running_mean(x: np.ndarray | scipy.sparse.spmatrix, n: int = 50, step: int
n
Length of the running window
step
only compute running windows ever `step` columns, e.g. if step is 10
0:100, 10:110, 20:120 etc. Saves memory.
only compute running windows every `step` columns, e.g. if step is 10
0:99, 10:109, 20:119 etc. Saves memory.
gene_list
List of gene names to be used in the convolution
calculate_gene_values
If True per gene CNVs will be calculated and stored in `adata.layers["gene_values_{key_added}"]`.
"""
if n < x.shape[1]: # regular convolution: the filter is smaller than the #genes
r = np.arange(1, n + 1)

pyramid = np.minimum(r, r[::-1])
smoothed_x = np.apply_along_axis(lambda row: np.convolve(row, pyramid, mode="same"), axis=1, arr=x) / np.sum(
pyramid
)
return smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]
smoothed_x = np.apply_along_axis(
lambda row: np.convolve(row, pyramid, mode="valid"),
axis=1,
arr=x,
) / np.sum(pyramid)

## get the indices of the genes used in the convolution
convolution_indices = get_convolution_indices(x, n)[np.arange(0, smoothed_x.shape[1], step)]
## Pull out the genes used in the convolution
convolved_gene_names = gene_list[convolution_indices]
smoothed_x = smoothed_x[:, np.arange(0, smoothed_x.shape[1], step)]

if calculate_gene_values:
convolved_gene_values = _calculate_gene_averages(convolved_gene_names, smoothed_x)
else:
convolved_gene_values = None

else: # there's less genes than the filtersize. just apply a single conv with a smaller filter (no sliding)
r = np.arange(1, x.shape[1] + 1)
pyramid = np.minimum(r, r[::-1])
smoothed_x = x @ pyramid.reshape(-1, 1)
smoothed_x = smoothed_x / pyramid.sum()
return smoothed_x
return smoothed_x, convolved_gene_values

else: # If there is less genes than the window size, set the window size to the number of genes and perform a single convolution
n = x.shape[1] # set the filter size to the number of genes
r = np.arange(1, n + 1)
## As we are only doing one convolution the values should be equal
pyramid = np.array([1] * n)
smoothed_x = np.apply_along_axis(
lambda row: np.convolve(row, pyramid, mode="valid"),
axis=1,
arr=x,
) / np.sum(pyramid)

if calculate_gene_values:
## As all genes are used the convolution the values are identical for all genes
convolved_gene_values = pd.DataFrame(np.repeat(smoothed_x, len(gene_list), axis=1), columns=gene_list)
else:
convolved_gene_values = None

return smoothed_x, convolved_gene_values


def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.ndarray]:
def _calculate_gene_averages(
convolved_gene_names: np.ndarray,
smoothed_x: np.ndarray,
) -> pd.DataFrame:
"""
Calculate the average value of each gene in the convolution
Parameters
----------
convolved_gene_names
A numpy array with the gene names used in the convolution
smoothed_x
A numpy array with the smoothed gene expression values
Returns
-------
convolved_gene_values
A DataFrame with the average value of each gene in the convolution
"""
## create a dictionary to store the gene values per sample
gene_to_values = {}
# Calculate the number of genes in each convolution, will be same as the window size default=100
length = len(convolved_gene_names[0])
# Convert the flattened convolved gene names to a list
flatten_list = list(convolved_gene_names.flatten())

# For each sample in smoothed_x find the value for each gene and store it in a dictionary
for sample, row in enumerate(smoothed_x):
# Create sample level in the dictionary
if sample not in gene_to_values:
gene_to_values[sample] = {}
# For each gene in the flattened gene list find the value and store it in the dictionary
for i, gene in enumerate(flatten_list):
if gene not in gene_to_values[sample]:
gene_to_values[sample][gene] = []
# As the gene list has been flattend we can use the floor division of the index
# to get the correct position of the gene to get the value and store it in the dictionary
gene_to_values[sample][gene].append(row[i // length])

for sample in gene_to_values:
for gene in gene_to_values[sample]:
gene_to_values[sample][gene] = np.mean(gene_to_values[sample][gene])

convolved_gene_values = pd.DataFrame(gene_to_values).T
return convolved_gene_values


def get_convolution_indices(x, n):
indices = []
for i in range(x.shape[1] - n + 1):
indices.append(np.arange(i, i + n))
return np.array(indices)


def _running_mean_by_chromosome(
expr, var, window_size, step, calculate_gene_values
) -> tuple[dict, np.ndarray, pd.DataFrame | None]:
"""Compute the running mean for each chromosome independently. Stack the resulting arrays ordered by chromosome.
Parameters
Expand All @@ -206,16 +326,31 @@ def _running_mean_by_chromosome(expr, var, window_size, step) -> tuple[dict, np.
"""
chromosomes = _natural_sort([x for x in var["chromosome"].unique() if x.startswith("chr") and x != "chrM"])

def _running_mean_for_chromosome(chr):
genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
tmp_x = expr[:, var.index.get_indexer(genes)]
return _running_mean(tmp_x, n=window_size, step=step)
running_means = [
_running_mean_for_chromosome(chr, expr, var, window_size, step, calculate_gene_values) for chr in chromosomes
]

running_means = [_running_mean_for_chromosome(chr) for chr in chromosomes]
running_means, convolved_dfs = zip(*running_means, strict=False)

chr_start_pos = dict(zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]), strict=False))
chr_start_pos = {}
for chr, i in zip(chromosomes, np.cumsum([0] + [x.shape[1] for x in running_means]), strict=False):
chr_start_pos[chr] = i

return chr_start_pos, np.hstack(running_means)
## Concatenate the gene dfs
if calculate_gene_values:
convolved_dfs = pd.concat(convolved_dfs, axis=1)

return chr_start_pos, np.hstack(running_means), convolved_dfs


def _running_mean_for_chromosome(chr, expr, var, window_size, step, calculate_gene_values):
genes = var.loc[var["chromosome"] == chr].sort_values("start").index.values
tmp_x = expr[:, var.index.get_indexer(genes)]
x_conv, convolved_gene_values = _running_mean(
tmp_x, n=window_size, step=step, gene_list=genes, calculate_gene_values=calculate_gene_values
)

return x_conv, convolved_gene_values


def _get_reference(
Expand Down Expand Up @@ -264,7 +399,7 @@ def _get_reference(
return reference


def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_threshold):
def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_threshold, calculate_gene_values=False):
"""The actual infercnv work is happening here.
Process chunks of serveral thousand genes independently since this
Expand All @@ -291,17 +426,23 @@ def _infercnv_chunk(tmp_x, var, reference, lfc_cap, window_size, step, dynamic_t
# Step 2 - clip log fold changes
x_clipped = np.clip(x_centered, -lfc_cap, lfc_cap)
# Step 3 - smooth by genomic position
chr_pos, x_smoothed = _running_mean_by_chromosome(x_clipped, var, window_size=window_size, step=step)
chr_pos, x_smoothed, conv_df = _running_mean_by_chromosome(
x_clipped, var, window_size=window_size, step=step, calculate_gene_values=calculate_gene_values
)
# Step 4 - center by cell
x_cell_centered = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
x_res = x_smoothed - np.median(x_smoothed, axis=1)[:, np.newaxis]
if calculate_gene_values:
gene_res = conv_df - np.median(conv_df, axis=1)[:, np.newaxis]
else:
gene_res = None

# Noise filtering
x_res = x_cell_centered
# step 5 - standard deviation based noise filtering
if dynamic_threshold is not None:
noise_thres = dynamic_threshold * np.std(x_res)
x_res[np.abs(x_res) < noise_thres] = 0
if calculate_gene_values:
gene_res[np.abs(gene_res) < noise_thres] = 0

x_res = scipy.sparse.csr_matrix(x_res)

return chr_pos, x_res
return chr_pos, x_res, gene_res
50 changes: 50 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,56 @@ def adata_mock(request):
return adata


@pytest.fixture
def adata_full_mock():
np.random.seed(0) # Set the seed to a fixed value
obs = pd.DataFrame().assign(sample=["sample1", "sample2", "sample3", "sample4"]) # Added "sample4"
var = pd.DataFrame().assign(
gene=["gene1", "gene2", "gene3", "gene4", "gene5", "gene6", "gene7", "gene8", "gene9", "gene10"],
start=[100, 200, 300, 400, 500, 0, 100, 200, 300, 400],
end=[199, 299, 399, 499, 599, 99, 199, 299, 399, 499],
chromosome=["chr1", "chr1", "chr1", "chr1", "chr1", "chr2", "chr2", "chr2", "chr2", "chr2"],
)
var.index = var["gene"]
X = sp.csr_matrix(np.random.randint(low=0, high=50, size=(4, 10))) # Changed size to (4, 10)
adata = sc.AnnData(X=X, obs=obs, var=var)

return adata


@pytest.fixture
def gene_res_actual():
df = pd.DataFrame(
{
"gene1": [0.75, -1.00, 0.00, 0.00],
"gene2": [0.00, 0.00, 0.75, 0.00],
"gene3": [0.000000, 0.000000, 0.91666667, 0.000000],
"gene4": [0.00, 0.00, 1.25, 0.00],
"gene5": [-0.75, 0.00, 1.25, 0.00],
"gene6": [0.000000, 0.000000, 0.000000, 0.921875],
"gene7": [0.000000, 0.000000, 0.000000, 0.703125],
"gene8": [0.0, 0.0, 0.0, 0.0],
"gene9": [0.0, 0.0, 0.0, 0.0],
"gene10": [0.75, 0.00, 0.00, 0.00],
}
)
df.index = df.index.astype(str)
return df


@pytest.fixture
def x_res_actual():
arr = np.array(
[
[1.00, 0.00, 0.00, 0.00, 0.00, 1.00],
[-1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
[0.00, 1.25, 1.25, 0.00, 0.00, 0.00],
[0.00, 0.00, 0.00, 0.875, 0.00, 0.00],
]
)
return arr


@pytest.fixture(params=[np.array, sp.csr_matrix, sp.csc_matrix])
def adata_ithgex(request):
return sc.AnnData(
Expand Down
Loading

0 comments on commit 0073690

Please sign in to comment.